#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import ast
import collections
from datetime import datetime
from itertools import repeat, izip
import multiprocessing
import urlparse

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record,
)
import nile
import pandas as pd
import yt.wrapper as yt
import logging
# The root logger is configured at ERROR level, so the logging.info() calls
# in this script are not emitted unless the level is lowered here.
logging.basicConfig(format='[%(asctime)s] %(filename)s[LINE:%(lineno)d] %(levelname)-8s %(message)s',
                    level=logging.ERROR)

# Number of worker processes in the multiprocessing pool created in main().
PROCESSES = 3
# Prefix of the job name passed to cluster.job() in one_process().
JOB_NAME = 'PP: FEATURES JOIN'


def argument_parser(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: optional list of argument strings to parse instead of
            ``sys.argv[1:]``. Defaults to ``None`` (read ``sys.argv``).

    Returns:
        argparse.Namespace with the attributes ``pool``, ``spath``, ``rpath``,
        ``featpath``, ``date1`` and ``date2`` (all strings; ``pool`` is
        ``None`` when ``-p`` is not given).

    Raises:
        SystemExit: raised by argparse when a required option is missing or
            an unknown option is passed.
    """
    parser = argparse.ArgumentParser(description='Get parameters')
    parser.add_argument(
        '-p',
        dest='pool',
        type=str,
        help='you hahn pool',
    )
    # The paths and the date range are used unconditionally by main(), so a
    # missing value is reported here as a usage error instead of surfacing
    # later as an obscure failure.
    parser.add_argument(
        '-sp',
        dest='spath',
        type=str,
        required=True,
        help='sessions path',
    )
    parser.add_argument(
        '-rp',
        dest='rpath',
        type=str,
        required=True,
        help='result path',
    )
    parser.add_argument(
        '-fp',
        dest='featpath',
        type=str,
        required=True,
        help='session features path',
    )
    parser.add_argument(
        '-d1',
        dest='date1',
        type=str,
        required=True,
        help='date1 format "yyyy-mm-dd"',
    )
    parser.add_argument(
        '-d2',
        dest='date2',
        type=str,
        required=True,
        help='date2 format "yyyy-mm-dd"',
    )
    return parser.parse_args(argv)


def one_process(tmp_root, job_root, features_root, feature_tables_list, pool, date):
    """Left-join one date's table with the latest earlier features table.

    Reads ``<tmp_root>/<date>``, joins it by ``uid`` (left join) with
    ``<features_root>/<table>`` and writes the result to ``<job_root>/<date>``.
    ``<table>`` is the greatest name in ``feature_tables_list`` that compares
    strictly less than ``date`` (plain string comparison; presumably the names
    are 'yyyy-mm-dd' dates -- confirm against the features directory).

    Failures are logged and swallowed so that one bad date does not abort the
    other dates processed by the pool.

    Args:
        tmp_root: directory with the per-date input tables.
        job_root: directory the joined tables are written to.
        features_root: directory with the features tables.
        feature_tables_list: names of the tables available in features_root.
        pool: pool name passed to ``clusters.Hahn``.
        date: name of the table to process, as a string.

    Returns:
        None.
    """
    logging.info(date + ' run')
    try:
        earlier_tables = [x for x in feature_tables_list if x < date]
        if not earlier_tables:
            # Previously this surfaced as an opaque "list index out of range".
            logging.error('Error with: {}, \nException: {}'.format(
                date, 'no features table earlier than ' + date))
            return
        needed_features_table = max(earlier_tables)
        logging.info(date + ': needed table - ' + needed_features_table)
        cluster = clusters.Hahn(pool=pool).env(
            templates=dict(
                tmp_root=tmp_root,
                job_root=job_root,
                features_root=features_root,
            )
        )
        job = cluster.job(JOB_NAME + ': tmp_' + date)
        log = job.table('$tmp_root/{}'.format(date))
        features_table = job.table('$features_root/{}'.format(needed_features_table))
        log.join(
            features_table,
            by='uid',
            type='left'
        ).put('$job_root/' + date)
        job.run()
    except Exception as ex:
        # logging.exception keeps the traceback, which logging.error dropped.
        logging.exception('Error with: {}, \nException: {}'.format(date, ex))


def one_process_star(all_args):
    """Call one_process with the items of ``all_args`` as positional arguments.

    Lets a single-argument mapping call (as used in main()) drive the
    multi-argument one_process.
    """
    outcome = one_process(*all_args)
    return outcome


def main():
    """Join every not-yet-processed date of the requested range in parallel.

    Builds the list of dates between ``-d1`` and ``-d2``, drops the dates that
    already have a table in the result directory, and runs one_process for the
    remaining dates in a pool of PROCESSES worker processes.
    """
    logging.info('script start')
    args = argument_parser()
    date1 = args.date1
    date2 = args.date2
    job_root = args.rpath
    tmp_root = args.spath
    features_root = args.featpath

    dates = [i.strftime('%Y-%m-%d') for i in pd.date_range(date1, date2)]
    yt.config.set_proxy("hahn")
    feature_tables_list = yt.list(features_root)
    logging.info('feat_tables:' + str(feature_tables_list))
    try:
        # Only names starting with '2' are treated as finished date tables.
        dates_done = set(x for x in yt.list(job_root) if x.startswith('2'))
    except Exception as ex:
        # Best effort: if the result directory cannot be listed, process the
        # whole range. Unlike a bare except, this lets KeyboardInterrupt and
        # SystemExit propagate.
        logging.warning('cannot list {}: {}; refreshing all dates'.format(job_root, ex))
        dates_done = set()
    # Filter instead of a set difference so the dates stay in calendar order.
    dates_for_refresh = [d for d in dates if d not in dates_done]

    if not dates_for_refresh:
        logging.info('nothing to refresh')
        return

    logging.info('join start')
    multiprocessing.freeze_support()
    tasks = [
        (tmp_root, job_root, features_root, feature_tables_list, args.pool, date)
        for date in dates_for_refresh
    ]
    proc_pool = multiprocessing.Pool(processes=PROCESSES)
    try:
        proc_pool.map(one_process_star, tasks)
    except BaseException:
        # Stop the workers right away on failure or interruption.
        proc_pool.terminate()
        raise
    else:
        proc_pool.close()
    finally:
        # Always reap the worker processes.
        proc_pool.join()


# Run the join only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
