#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import ast
import collections
import datetime
from itertools import repeat, izip
import multiprocessing
import urlparse

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record,
)
import nile
import pandas as pd
import yt.wrapper as yt
import logging
# Root logger setup: only ERROR and above are emitted, so the logging.info()
# progress messages in this file are suppressed unless the level is lowered.
logging.basicConfig(format='[%(asctime)s] %(filename)s[LINE:%(lineno)d] %(levelname)-8s %(message)s',
                    level=logging.ERROR)

# Number of worker processes in the multiprocessing pool created by main().
PROCESSES = 2
# Prefix of the nile job name; the processed date is appended per job.
JOB_NAME = 'HOME TRAIN TEST (CRYPTA)'
# YT directory that is listed for available dates and read as the job input.
CRYPTA_ROOT = '//statbox/crypta-yandexuid-profiles-log'


def argument_parser():
    """Parse the command line and return the resulting namespace.

    Recognised options (all optional strings, ``None`` when omitted):
      -p   -> ``pool``   (pool name)
      -r   -> ``rpath``  (result path)
      -d1  -> ``date1``  (first date, "yyyy-mm-dd")
      -d2  -> ``date2``  (last date, "yyyy-mm-dd")
    """
    # (flag, destination attribute, help text) for every supported option.
    option_specs = (
        ('-p', 'pool', 'you hahn pool'),
        ('-r', 'rpath', 'result path'),
        ('-d1', 'date1', 'date1 format "yyyy-mm-dd"'),
        ('-d2', 'date2', 'date2 format "yyyy-mm-dd"'),
    )

    cli = argparse.ArgumentParser(description='Get parameters')
    for flag, dest, help_text in option_specs:
        cli.add_argument(flag, dest=dest, type=str, help=help_text)
    return cli.parse_args()


# Profile columns for which the aggregation keeps one value per yandexuid.
_PROFILE_FIELDS = (
    'update_time',
    'gender',
    'user_age_6s',
    'income_segments',
    'top_common_site_ids',
    'top_common_sites',
    'search_fraudness',
    'heuristic_segments',
    'probabilistic_segments',
    'interests_composite',
    'yandex_loyalty',
    'interests_longterm',
    'ado_lal',
    'multiclass_segments',
    'marketing_segments',
    'yandex_services_visits',
)


def _crypta_dates_in_window(available_dates, last_day, days_back):
    """Return the sorted "yyyy-mm-dd" dates of the window that are available.

    The window is inclusive on both ends and covers ``last_day - days_back``
    through ``last_day``. Only names that are present in ``available_dates``
    are returned.
    """
    wanted = set(
        (last_day - datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
        for offset in range(days_back + 1)
    )
    return sorted(wanted.intersection(available_dates))


def one_process(job_root, pool, date):
    """Build the profile snapshot table for one date.

    Reads the crypta tables found under CRYPTA_ROOT for the 3 days up to and
    including ``date`` (widened to 7 days when fewer than two tables are
    found), groups the records by ``yandexuid``, aggregates every profile
    field with ``na.last(..., by='update_time')`` and writes the result to
    ``<job_root>/<date>``.

    Args:
        job_root: YT directory that receives the result table.
        pool: pool name passed to the Hahn cluster.
        date: target date as a "yyyy-mm-dd" string.

    Returns:
        None. Any exception is logged with its traceback and swallowed, so
        one failing date does not abort the other dates handled by the pool.
    """
    logging.info(date + ' run')
    try:
        cluster = clusters.Hahn(pool=pool).env(templates=dict(
            job_root=job_root,
            crypta_root=CRYPTA_ROOT,
        ))
        date_parsed = datetime.datetime.strptime(date, "%Y-%m-%d").date()

        yt.config.set_proxy("hahn")
        crypta_dates = yt.list(CRYPTA_ROOT)

        dates_for_prepare = _crypta_dates_in_window(crypta_dates, date_parsed, 3)
        if len(dates_for_prepare) < 2:
            # Too few source tables in the short window: look a full week back.
            dates_for_prepare = _crypta_dates_in_window(crypta_dates, date_parsed, 7)
        if not dates_for_prepare:
            # Nothing to read; do not submit a job with an empty path pattern.
            logging.error('Error with: %s, no crypta tables found in %s', date, CRYPTA_ROOT)
            return

        # Brace pattern substituted for "@dates" in the input table path.
        dates_str = '{' + ','.join(dates_for_prepare) + '}'

        job = cluster.job(JOB_NAME + ': tmp_' + date).env(
            templates=dict(
                dates=dates_str,
            )
        )
        log = job.table('$crypta_root/@dates')
        # One na.last aggregator per profile field, all ordered by update_time.
        latest_values = dict(
            (field, na.last(field, by='update_time')) for field in _PROFILE_FIELDS
        )
        log.groupby('yandexuid').sort('update_time').aggregate(
            **latest_values
        ).put('$job_root/' + date)
        job.run()
    except Exception:
        # Best-effort per date: keep the traceback in the log and carry on.
        logging.exception('Error with: %s', date)


def one_process_star(all_args):
    """Call one_process with the positional arguments packed in ``all_args``.

    Pool.map hands each worker a single item, so the (job_root, pool, date)
    arguments arrive packed in one sequence and are unpacked here.
    """
    outcome = one_process(*all_args)
    return outcome


def main():
    """Build snapshot tables for every missing date in the requested range.

    Expands ``-d1``..``-d2`` into daily dates, drops the dates that already
    have a table under the result path and processes the remaining ones in a
    pool of PROCESSES worker processes.
    """
    args = argument_parser()
    job_root = args.rpath

    dates = [day.strftime('%Y-%m-%d') for day in pd.date_range(args.date1, args.date2)]
    try:
        yt.config.set_proxy("hahn")
        # Only names starting with '2' are treated as finished date tables.
        dates_done = set(name for name in yt.list(job_root) if name.startswith('2'))
    except Exception as ex:
        # Best effort: if the result directory cannot be listed, rebuild
        # every requested date.
        logging.warning('Cannot list %s, refreshing all dates: %s', job_root, ex)
        dates_done = set()

    # Keep chronological order and skip dates that are already built.
    dates_for_refresh = [day for day in dates if day not in dates_done]
    if not dates_for_refresh:
        logging.info('nothing to refresh')
        return

    multiprocessing.freeze_support()
    proc_pool = multiprocessing.Pool(processes=PROCESSES)
    try:
        proc_pool.map(
            one_process_star,
            [(job_root, args.pool, day) for day in dates_for_refresh],
        )
    finally:
        # Always release the worker processes, even if map() raised.
        proc_pool.close()
        proc_pool.join()


# Run the refresh only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
