import argparse
import datetime
import urlparse
import time

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record,
)
import nile
import pandas as pd
import yt.wrapper as yt
import logging
# Configure the root logger: only records of level ERROR and above are
# emitted, each prefixed with timestamp, source file name, line number and
# level name.
logging.basicConfig(format='[%(asctime)s] %(filename)s[LINE:%(lineno)d] %(levelname)-8s %(message)s',
                    level=logging.ERROR)

# Prefix of the name given to the Nile job started by this script.
JOB_NAME = 'TURBO ZEN METRIC'
# YT path whose children are listed and matched against 'YYYY-MM-DD' dates
# to select the input log tables.
LOG_ROOT = '//logs/zen-stats-log/1d'
# Maximum number of attempts to build and run the job.
RETRY_NUM = 48
# Pause between attempts, in seconds (30 minutes); passed to time.sleep().
SLEEP_TIME = 60 * 30

# Values of the 'product' field that are kept when filtering click events.
GOOD_PRODUCT = [
    'abr_native',
    'web',
    'zenkit',
    'apad_native',
    'ios_native',
    'ipad_native',
    'launcher_native',
    'site_mobile',
    'zen_app',
    'launcher',
]


def get_host(url):
    """Return the hostname of *url* without a leading ``www.`` prefix.

    The hostname is taken from ``urlparse.urlparse(url).hostname`` (which is
    lower-cased by the parser). Only a *leading* ``www.`` is removed, so a
    host such as ``awww.example.com`` is returned unchanged.

    Args:
        url: the URL to parse; any value is accepted.

    Returns:
        The hostname, or the string ``'Unknown'`` when the URL has no
        hostname or cannot be parsed at all.
    """
    prefix = 'www.'
    # Deliberately best-effort: this runs over raw log records, so any
    # parsing failure maps to 'Unknown' instead of aborting the job.
    # ``Exception`` (rather than a bare ``except``) keeps KeyboardInterrupt
    # and SystemExit propagating.
    try:
        host = urlparse.urlparse(url).hostname
        if not host:
            # No network location in the URL (e.g. a relative path).
            return 'Unknown'
        if host.startswith(prefix):
            host = host[len(prefix):]
        return host
    except Exception:
        return 'Unknown'


def argument_parser(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: optional list of argument strings to parse. When ``None``
            (the default) argparse reads the arguments from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``pool`` (str or
        None), ``rpath`` (str or None), ``timestamp`` (int) and ``ndays``
        (int).

    Raises:
        SystemExit: raised by argparse when ``-t`` or ``-n`` is missing or
            an argument cannot be converted to its declared type.
    """
    parser = argparse.ArgumentParser(description='Get parameters')
    parser.add_argument(
        '-p',
        dest='pool',
        type=str,
        help='you hahn pool',
    )
    parser.add_argument(
        '-r',
        dest='rpath',
        type=str,
        help='result path',
    )
    # The timestamp and the number of days feed date arithmetic in main();
    # without them the script can only fail with a TypeError, so report the
    # omission as a usage error instead.
    parser.add_argument(
        '-t',
        dest='timestamp',
        type=int,
        required=True,
        help='timestamp',
    )
    parser.add_argument(
        '-n',
        dest='ndays',
        type=int,
        required=True,
        help='num of days',
    )
    return parser.parse_args(argv)


def _date_window(timestamp, n_days):
    """Return ``(current_date, dates)`` for the period ending the day before *timestamp*.

    Args:
        timestamp: Unix timestamp; values above 100000000000 are treated as
            milliseconds and converted to seconds.
        n_days: how many days before *timestamp* the period starts.

    Returns:
        A tuple of the 'YYYY-MM-DD' string for the day before *timestamp*
        and the list of 'YYYY-MM-DD' strings from ``timestamp - n_days``
        days up to and including that day.
    """
    if timestamp > 100000000000:
        # Floor division keeps the value an integer number of seconds.
        timestamp = timestamp // 1000
    # NOTE(review): fromtimestamp() interprets the value in the local time
    # zone of the host -- confirm this matches the naming of the log tables.
    moment = datetime.datetime.fromtimestamp(timestamp)
    current_date = (moment - datetime.timedelta(1)).strftime('%Y-%m-%d')
    last_needed_date = (moment - datetime.timedelta(n_days)).strftime('%Y-%m-%d')
    dates = [
        day.strftime('%Y-%m-%d')
        for day in pd.date_range(last_needed_date, current_date)
    ]
    return current_date, dates


def main():
    """Build and run the Nile job that aggregates turbo click statistics.

    Reads the command-line options, selects the daily log tables under
    ``LOG_ROOT`` that fall into the requested period, keeps ``click`` events
    whose ``product`` is listed in ``GOOD_PRODUCT`` and writes two tables
    under the result path: per-host aggregates (``host_tables/<date>``) and
    overall aggregates (``total_tables/<date>``), where ``<date>`` is the day
    before the given timestamp.

    The job is attempted up to ``RETRY_NUM`` times, sleeping ``SLEEP_TIME``
    seconds after each ``MissingSourceTablesError``.

    Raises:
        nile.nodes.table.MissingSourceTablesError: if the last attempt still
            fails because source tables are missing.
    """
    args = argument_parser()
    job_root = args.rpath
    current_date, dates = _date_window(args.timestamp, args.ndays)

    yt.config.set_proxy("hahn")

    cluster = clusters.Hahn(pool=args.pool).env(templates=dict(
        log_root=LOG_ROOT,
        job_root=job_root,
        )
    )

    for retry in range(RETRY_NUM):
        # Re-list the log directory on every attempt: tables that appeared
        # while we were sleeping must be picked up, otherwise each retry
        # would run against the same stale set of dates.
        available_dates = set(yt.list(LOG_ROOT))
        dates_str = '{' + ','.join([x for x in dates if x in available_dates]) + '}'
        try:
            job = cluster.job(JOB_NAME + ': filter_logs_{}'.format(current_date)).env(
                templates=dict(
                    dates=dates_str,
                )
            )
            all_log = job.table('$log_root/@dates')
            result_logs = all_log.filter(
                nf.and_(
                    nf.equals('event', 'click'),
                    nf.custom(lambda p: p in GOOD_PRODUCT, 'product')
                )
            ).project(
                api_name='api_name',
                is_turbo=ne.custom(lambda x: 1 if x == 'turbo' else 0, 'url_type'),
                host=ne.custom(get_host, 'url'),
            )
            # Per-host statistics.
            result_logs.groupby('host').aggregate(
                count=na.count(),
                turbo_count=na.sum('is_turbo'),
                turbo_freq=na.mean('is_turbo'),
            ).put('$job_root/host_tables/{}'.format(current_date))
            # Statistics over all selected records (empty grouping key).
            result_logs.groupby().aggregate(
                count=na.count(),
                turbo_count=na.sum('is_turbo'),
                turbo_freq=na.mean('is_turbo'),
            ).put('$job_root/total_tables/{}'.format(current_date))
            job.run()
            break
        except nile.nodes.table.MissingSourceTablesError:
            if retry == RETRY_NUM - 1:
                # Out of attempts: propagate the failure instead of exiting
                # as if the tables had been produced.
                logging.error('Error with try {}, no retries left'.format(retry))
                raise
            logging.error('Error with try {}, go to sleep for {} second'.format(retry, SLEEP_TIME))
            time.sleep(SLEEP_TIME)


# Run the job only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
