#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import ast
import collections
from datetime import datetime
from itertools import repeat, izip
import multiprocessing
import urlparse

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record,
)
import nile
import pandas as pd
import yt.wrapper as yt
import logging
# Configure the root logger. With level=logging.ERROR the logging.info()
# calls made elsewhere in this script are filtered out; only records of
# level ERROR and above are emitted.
logging.basicConfig(format='[%(asctime)s] %(filename)s[LINE:%(lineno)d] %(levelname)-8s %(message)s',
                    level=logging.ERROR)

# Number of worker processes in the multiprocessing pool created by main().
PROCESSES = 3
# Prefix of the nile job name; one_process() appends ': tmp_<date>' to it.
JOB_NAME = 'HOME CRYPTA JOIN'


def argument_parser():
    """Parse the command-line options of the script.

    All options are optional strings; an option that is not supplied is left
    as ``None`` on the returned namespace.

    Returns:
        argparse.Namespace with the attributes ``pool``, ``spath``, ``cpath``,
        ``rpath``, ``featpath``, ``date1``, ``date2`` and ``uapath``.
    """
    # (flag, destination attribute, help text), in the order they are
    # registered and therefore shown by --help.
    option_specs = (
        ('-p', 'pool', 'you hahn pool'),
        ('-sp', 'spath', 'sessions path'),
        ('-cp', 'cpath', 'crypta path'),
        ('-rp', 'rpath', 'result path'),
        ('-fp', 'featpath', 'session features path'),
        ('-d1', 'date1', 'date1 format "yyyy-mm-dd"'),
        ('-d2', 'date2', 'date2 format "yyyy-mm-dd"'),
        ('-ap', 'uapath', 'useraction path'),
    )

    cli = argparse.ArgumentParser(description='Get parameters')
    for flag, destination, help_text in option_specs:
        cli.add_argument(flag, dest=destination, type=str, help=help_text)
    return cli.parse_args()


def one_process(tmp_root, crypta_root, job_root, features_root, feature_tables_list, useractions_list, pool, date):
    """Build and run the nile job that joins the tables of one date.

    The table ``<tmp_root>/<date>`` is left-joined by ``uid`` with, in order:
    the projected crypta table ``<crypta_root>/<date>``, the table
    ``<useractions_list>/<date>`` and the features table
    ``<features_root>/<name>``, where ``<name>`` is the greatest entry of
    ``feature_tables_list`` that sorts strictly before ``date``. The result
    is written to ``<job_root>/<date>``.

    Errors are logged instead of raised, so a caller that maps this function
    over many dates keeps going when a single date fails.

    Args:
        tmp_root: path root of the per-date tables forming the left side of
            the join.
        crypta_root: path root of the per-date crypta tables.
        job_root: path root under which the joined table is written.
        features_root: path root of the features tables.
        feature_tables_list: names of the tables available under
            ``features_root``; compared with ``date`` as plain strings.
        useractions_list: despite the name, a path root: the table
            ``<useractions_list>/<date>`` is read from it.
        pool: pool name passed to ``clusters.Hahn``.
        date: name of the per-date tables to process (callers in this file
            pass 'YYYY-MM-DD' strings).

    Returns:
        None.
    """
    logging.info(date + ' run')
    try:
        earlier_feature_tables = [x for x in feature_tables_list if x < date]
        if not earlier_feature_tables:
            # Previously this surfaced as an opaque "list index out of range".
            logging.error('Error with: {}, \nException: {}'.format(
                date,
                'no features table dated before {} under {}'.format(date, features_root),
            ))
            return
        needed_features_table = max(earlier_feature_tables)

        cluster = clusters.Hahn(pool=pool).env(templates=dict(
            tmp_root=tmp_root,
            job_root=job_root,
            crypta_root=crypta_root,
            features_root=features_root,
            useractions_list=useractions_list,
        ))
        job = cluster.job(JOB_NAME + ': tmp_' + date)

        # Crypta columns carried over under their own names.
        crypta_columns = (
            'heuristic_segments',
            'probabilistic_segments',
            'top_common_site_ids',
            'yandex_loyalty',
            'ado_lal',
            'search_fraudness',
            'gender',
            'top_common_sites',
            'user_age_6s',
            'interests_longterm',
            'income_segments',
            'yandex_services_visits',
            'interests_composite',
            'update_time',
            'marketing_segments',
            'multiclass_segments',
        )
        passthrough = dict((column, column) for column in crypta_columns)

        log = job.table('$tmp_root/{}'.format(date))
        crypta_logs = job.table('$crypta_root/{}'.format(date)).project(
            # The join key: 'y' prepended to the stringified yandexuid.
            uid=ne.custom(lambda x: 'y' + str(x), 'yandexuid'),
            **passthrough
        )
        features_table = job.table('$features_root/{}'.format(needed_features_table))
        user_actions = job.table('$useractions_list/{}'.format(date))

        log.join(
            crypta_logs,
            by='uid',
            type='left'
        ).join(
            user_actions,
            by='uid',
            type='left'
        ).join(
            features_table,
            by='uid',
            type='left'
        ).put('$job_root/' + date)
        job.run()
    except Exception as ex:
        # Best effort per date: report the failure (with traceback) and return.
        logging.exception('Error with: {}, \nException: {}'.format(date, ex))


def one_process_star(all_args):
    """Call ``one_process`` with the positional arguments packed in ``all_args``.

    Adapter for ``Pool.map``, which hands each task to the worker function as
    a single object.

    Args:
        all_args: iterable of the positional arguments of ``one_process``,
            in signature order.

    Returns:
        Whatever ``one_process`` returns.
    """
    positional = tuple(all_args)
    return one_process(*positional)


def main():
    """Run the per-date join for every date in the range that is not done yet.

    Reads the options via ``argument_parser``, expands ``date1``..``date2``
    into daily 'YYYY-MM-DD' strings, skips the dates that already have a node
    under the result path and runs ``one_process`` for the remaining dates in
    a pool of ``PROCESSES`` worker processes.
    """
    logging.info('script start')
    args = argument_parser()
    job_root = args.rpath
    tmp_root = args.spath
    crypta_root = args.cpath
    features_root = args.featpath
    uapath = args.uapath

    dates = [day.strftime('%Y-%m-%d') for day in pd.date_range(args.date1, args.date2)]
    yt.config.set_proxy("hahn")
    feature_tables_list = yt.list(features_root)

    try:
        existing_nodes = yt.list(job_root)
    except Exception as ex:
        # Best effort: if the result directory cannot be listed, process
        # every requested date. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        logging.warning('Cannot list {}, refreshing all dates: {}'.format(job_root, ex))
        existing_nodes = []

    # Only node names starting with '2' are treated as finished dates.
    dates_done = set(x for x in existing_nodes if x.startswith('2'))
    # Filtering the ordered list (instead of a set difference) keeps the
    # dates in chronological order.
    dates_for_refresh = [date for date in dates if date not in dates_done]
    if not dates_for_refresh:
        logging.info('nothing to refresh')
        return

    logging.info('parsing start')
    tasks = [
        (tmp_root, crypta_root, job_root, features_root, feature_tables_list, uapath, args.pool, date)
        for date in dates_for_refresh
    ]

    multiprocessing.freeze_support()
    proc_pool = multiprocessing.Pool(processes=PROCESSES)
    try:
        proc_pool.map(one_process_star, tasks)
    finally:
        # map() blocks until every task is finished (or raises), so the
        # workers can be stopped right away instead of being left running.
        proc_pool.terminate()
        proc_pool.join()


# Run the join only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
