#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import pytz
import ast
import collections
from datetime import datetime
from itertools import repeat, izip
import multiprocessing
import urlparse
import qb2.resources_impl.testenv
from geobase4 import Lookup

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record,
)
import nile
import pandas as pd
import yt.wrapper as yt
import logging
# Root logger setup: only ERROR and above are emitted, each message prefixed
# with timestamp, source file name, line number and level.
logging.basicConfig(format='[%(asctime)s] %(filename)s[LINE:%(lineno)d] %(levelname)-8s %(message)s',
                    level=logging.ERROR)

# Prefix of the job name built in main().
JOB_NAME = 'APP HOME TRAIN TEST'
# Placeholder make_features() writes when a feature's input value is None.
EMPTY_CONST = -100
# NOTE(review): not referenced anywhere in the visible part of this file.
ZERO_CONST = 0
# Names of the per-block features. The values are the defaults that
# make_features() uses when blocks_dict is given but has no entry for the
# requested block path.
BLOCK_FEATURES = {
    'clicked': 0,
    'clicked_freq': 0,
    'clicked_top': 0,
    'count': 0,
    'ctr': 0,
    'ctr_top': 0,
    'showed_freq': 0,
    'total_clicks': 0,
}

# Whitelist of block paths, one per line of the local 'b_list' file; main()
# keeps only log records whose 'path' is in this list.
# Blank lines (including the empty string produced by a trailing newline) are
# dropped so that a record with an empty path cannot match the whitelist.
with open('b_list', 'r') as b_list:
    BLAND_BLOCKS = [line for line in b_list.read().split('\n') if line]


def show_day_part(ts, pytz_timezone):
    """Return the name of the part of day that timestamp ``ts`` falls into.

    The hour is taken from ``ts`` converted to ``pytz_timezone`` (any tzinfo
    accepted by ``datetime.fromtimestamp``):

        22..23 and 0..3 -> 'night'
        4..7            -> 'early_morning'
        8..12           -> 'morning'
        13..17          -> 'afternoon'
        18..21          -> 'evening'
    """
    local_hour = datetime.fromtimestamp(ts, pytz_timezone).hour
    # Night wraps around midnight, so it is checked first.
    if local_hour <= 3 or local_hour >= 22:
        return 'night'
    if local_hour <= 7:
        return 'early_morning'
    if local_hour <= 12:
        return 'morning'
    if local_hour <= 17:
        return 'afternoon'
    return 'evening'


def make_features(
    ts,
    timezone,
    block_path,
    mean_clicked,
    mean_realshow_blocks,
    req_clicked_freq,
    req_count,
    blocks_dict,
    mean_blocks,
):
    """Build the feature dictionary for a single log record.

    Args:
        ts: Timestamp passed to ``datetime.fromtimestamp``.
        timezone: Time zone name for ``pytz.timezone``; if pytz cannot
            resolve it, 'Europe/Moscow' is used instead.
        block_path: Key looked up in ``blocks_dict``.
        mean_clicked: Personal statistic, ``None`` if unknown.
        mean_realshow_blocks: Personal statistic, ``None`` if unknown.
        req_clicked_freq: Personal statistic, ``None`` if unknown.
        req_count: Personal statistic, ``None`` if unknown.
        blocks_dict: ``None`` or a mapping ``block path -> {feature: value}``
            that holds every key of ``BLOCK_FEATURES`` for each block.
        mean_blocks: Personal statistic, ``None`` if unknown.

    Returns:
        dict with 'week_day' (ISO weekday as a string), 'weekend' (0/1),
        'day_time', five 'personal__*' entries and one 'block__<name>' entry
        per key of ``BLOCK_FEATURES``.
    """
    features_dict = {}
    try:
        pytz_timezone = pytz.timezone(timezone)
    except Exception:
        # Unknown/missing/non-string zone: fall back to Moscow time.
        # ``Exception`` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        pytz_timezone = pytz.timezone('Europe/Moscow')

    features_dict['week_day'] = str(datetime.fromtimestamp(ts, pytz_timezone).isoweekday())
    features_dict['weekend'] = int(features_dict['week_day'] in ['6', '7'])
    features_dict['day_time'] = show_day_part(ts, pytz_timezone)

    # Personal features: a missing value is encoded as EMPTY_CONST.
    personal_features = (
        ('personal__mean_clicked', mean_clicked),
        ('personal__mean_realshow_blocks', mean_realshow_blocks),
        ('personal__req_clicked_freq', req_clicked_freq),
        ('personal__req_count', req_count),
        ('personal__mean_blocks', mean_blocks),
    )
    for feature_name, value in personal_features:
        features_dict[feature_name] = EMPTY_CONST if value is None else value

    # Block features come from one of three sources:
    #   * no statistics at all            -> EMPTY_CONST for every feature
    #   * statistics exist for this block -> the block's own values
    #   * statistics exist, block unknown -> defaults from BLOCK_FEATURES
    if blocks_dict is None:
        block_values = dict.fromkeys(BLOCK_FEATURES, EMPTY_CONST)
    elif block_path in blocks_dict:
        block_values = blocks_dict[block_path]
    else:
        block_values = BLOCK_FEATURES
    for block_feat in BLOCK_FEATURES:
        features_dict['block__{}'.format(block_feat)] = block_values[block_feat]
    return features_dict


def geobase_reduce(groups):
    """Reducer that attaches a time zone name to every request of a region.

    Args:
        groups: Iterable of ``(key, recs)`` pairs where ``key`` exposes
            ``region`` and ``date`` and every record in ``recs`` has
            'reqid' and 'clicks_in_req'.

    Yields:
        Record with reqid, region, date, timezone and clicks_in_req.
        ``timezone`` is the literal string 'error' when the region lookup
        fails, the zone name is empty, or pytz does not accept the name.
    """
    gb = Lookup('geodata4.bin')
    for key, recs in groups:
        region = key.region
        date = key.date
        try:
            timezone = gb.region_by_id(region).timezone
            # Called only for validation: raises if pytz does not know the zone.
            pytz.timezone(timezone)
        except Exception:
            # ``Exception`` rather than a bare except so KeyboardInterrupt and
            # SystemExit still propagate.
            timezone = 'error'
        if timezone == '':
            timezone = 'error'
        for request in recs:
            yield Record(
                reqid=request['reqid'],
                region=region,
                date=date,
                timezone=timezone,
                clicks_in_req=request['clicks_in_req'],
            )


def argument_parser():
    """Parse the command line and return the resulting namespace.

    Every option takes a string and is optional (``None`` when omitted):
    ``-p`` -> pool, ``-lr`` -> lpath, ``-r`` -> rpath, ``-d1`` -> date1,
    ``-d2`` -> date2.
    """
    # (flag, destination attribute, help text), in registration order.
    option_specs = (
        ('-p', 'pool', 'you hahn pool'),
        ('-lr', 'lpath', 'logs path'),
        ('-r', 'rpath', 'result path'),
        ('-d1', 'date1', 'date1 format "yyyy-mm-dd"'),
        ('-d2', 'date2', 'date2 format "yyyy-mm-dd"'),
    )
    cli = argparse.ArgumentParser(description='Get parameters')
    for flag, dest, help_text in option_specs:
        cli.add_argument(flag, dest=dest, type=str, help=help_text)
    return cli.parse_args()


def main():
    """Assemble and run the Nile job described by the command-line arguments.

    Steps, in order:
      1. Read the per-day log tables under the log root for those days of
         [date1, date2] that are listed there.
      2. Keep records whose 'path' is in BLAND_BLOCKS.
      3. Sum 'num_clicks' per (reqid, region, date) and attach a time zone
         to each request via geobase_reduce.
      4. Left-join that back onto the records, keep rows with
         clicks_in_req > 0 and timezone != 'error', project the output
         columns (including the make_features dictionary) and write them to
         the result path.
    """
    cli_args = argument_parser()
    first_day, last_day = cli_args.date1, cli_args.date2
    result_path = cli_args.rpath
    log_root = cli_args.lpath

    env_templates = dict(
        result_path=result_path,
        log_root=log_root,
        tmp_files='//home/search-functionality/syudin/tmp',
    )
    cluster = clusters.Hahn(pool=cli_args.pool).env(templates=env_templates)

    requested_days = [
        day.strftime('%Y-%m-%d') for day in pd.date_range(first_day, last_day)
    ]

    yt.config.set_proxy('hahn')
    geobase_files = [nile.files.StatboxDict('geodata4.bin')]

    # Restrict the requested days to those listed under the log root and
    # pack them into a '{day1,day2,...}' template value.
    present_days = list(set(requested_days).intersection(set(yt.list(log_root))))
    days_glob = ''.join(('{', ','.join(present_days), '}'))

    job_title = JOB_NAME + ': result_{}_{}'.format(first_day, last_day)
    job = cluster.job(job_title).env(templates=dict(dates=days_glob))

    # Records of whitelisted blocks only.
    block_rows = job.table('$log_root/@dates').filter(
        nf.custom(lambda path: path in BLAND_BLOCKS, 'path')
    )

    # Total clicks per request, then a time zone per (region, date) group.
    clicks_per_request = block_rows.groupby('reqid', 'region', 'date').aggregate(
        clicks_in_req=na.sum('num_clicks')
    )
    requests_with_tz = clicks_per_request.groupby('region', 'date').reduce(
        geobase_reduce,
        files=geobase_files,
    )

    joined = block_rows.join(
        requests_with_tz,
        by=('reqid', 'date', 'region'),
        type='left'
    )
    usable_rows = joined.filter(
        nf.and_(
            nf.custom(lambda clicks: clicks > 0, 'clicks_in_req'),
            nf.custom(lambda tz_name: tz_name != 'error', 'timezone'),
        )
    )

    usable_rows.project(
        key=ne.custom(
            lambda reqid, path, clicks, realshow: '{}_{}_{}_{}'.format(reqid, path, clicks, realshow),
            'reqid',
            'path',
            'num_clicks',
            'realshow',
        ),
        subkey='date',
        uid='uid',
        path='path',
        region='region',
        timezone='timezone',
        label_clicked=ne.custom(lambda clicks: int(clicks > 0), 'num_clicks'),
        features_dict=ne.custom(
            make_features,
            'ts',
            'timezone',
            'path',
            'mean_clicked',
            'mean_realshow_blocks',
            'req_clicked_freq',
            'req_count',
            'blocks_dict',
            'mean_blocks',
        ),
        # Click label is defined only for rows with a truthy 'realshow'.
        label_realshow=ne.custom(
            lambda clicks, realshow: int(clicks > 0) if realshow else None,
            'num_clicks',
            'realshow',
        ),
    ).put('$result_path')
    job.run()

# Build and run the job only when the file is executed as a script.
if __name__ == "__main__":
    main()
