#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import datetime
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record
)
import re
import math
import json
import nile
import time


class Counter(object):
    """Callable that totals one field over every entry of a mapping.

    The instance is called with a mapping whose values are themselves
    mappings (``obj[key][field]``); it returns the sum of ``field`` over all
    keys. Falsy values such as ``None`` are counted as ``0``.
    """

    def __init__(self, field):
        # Name of the inner field to accumulate.
        self.field = field

    def __call__(self, obj):
        field = self.field
        # ``or 0`` turns None (or any other falsy value) into a zero summand.
        return sum((obj[key][field] or 0) for key in obj)


def parse_date(s):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Returns ``None`` when ``s`` cannot be parsed, i.e. when ``strptime``
    raises ``TypeError``, ``ValueError`` or ``AttributeError``.
    """
    try:
        parsed = datetime.datetime.strptime(s, '%Y-%m-%d')
    except (TypeError, ValueError, AttributeError):
        return None
    return parsed.date()


class CutoffByDate(object):
    """Mapper that keeps only the per-day entries inside a date window.

    Each incoming record is expected to carry a ``data`` mapping whose keys
    are ``YYYY-MM-DD`` strings. Entries whose date lies outside
    ``[start_date, end_date]`` (both inclusive) are dropped; records left
    with no entries are not emitted at all.
    """

    def __init__(self, start_date, end_date):
        # Inclusive bounds; compared against the result of parse_date(),
        # so they are expected to be datetime.date objects.
        self.start_date = start_date
        self.end_date = end_date

    def __call__(self, records):
        for rec in records:
            result = rec.to_dict()
            kept = {}
            for day, stats in result['data'].items():
                # Parse each key once. parse_date() returns None for a
                # malformed key, and ordering None against a date raises
                # TypeError, so such keys are skipped explicitly instead of
                # crashing the whole job.
                parsed = parse_date(day)
                if parsed is None:
                    continue
                if self.start_date <= parsed <= self.end_date:
                    kept[day] = stats
            result['data'] = kept
            if kept:
                yield Record(**result)

class mapper(object):
    """Mapper that renders aggregated stats as a tab-separated factor string.

    For every input record with a non-empty ``canoUrl`` it emits
    ``Record(key=<canoUrl>, subkey="", value=<factors>)`` where ``<factors>``
    is ``LVTByLast<T>=..\\tTVTByLast<T>=..\\tShowsByLast<T>=..\\tUsersByLast<T>=..``
    and ``<T>`` is ``stat_type + suffix``.
    """

    # (factor name prefix, record field) pairs, in output order.
    _FACTORS = (
        ("LVT", "lvt"),
        ("TVT", "tvt"),
        ("Shows", "shows"),
        ("Users", "users"),
    )

    def __init__(self, stat_type="180Days", suffix=''):
        self.stat_type = stat_type
        self.suffix = suffix

    def __call__(self, recs):
        tag = self.stat_type + self.suffix
        for rec in recs:
            clean_url = rec["canoUrl"]
            if not clean_url:
                # Skip only this record. The previous ``return`` ended the
                # generator here and silently dropped every later record.
                continue

            value = "\t".join(
                name + "ByLast" + tag + "=" + str(rec[field])
                for name, field in self._FACTORS
            )
            yield Record(key=clean_url, subkey="", value=value)


def get_spy_last_date(cluster, table):
    """Return the ``last_date`` attribute of the first record of ``table``.

    ``cluster`` must provide ``read(table)`` returning an iterable of
    records. Raises ``IndexError`` (with the table path in the message) when
    the table has no records.
    """
    recs = list(cluster.read(table))
    if not recs:
        # Same exception type as the bare recs[0] lookup would raise, but
        # with a message that says which table is empty.
        raise IndexError(
            'Table {} has no records; cannot determine last date'.format(table)
        )
    return recs[0].last_date

# Path prefix of the table that main() copies the prepared factors to;
# main() appends the factor suffix, the stat type, "." + a Unix timestamp
# and ".tab" to it.
SPY_LOGS_FACTORS_TABLE_PREFIX = "//home/videoindex/static_factors_new_format/portions/spy_factors"
# Same naming scheme for the "_fast" table, which main() builds only for the
# '7Days' and '1Day' stat types.
SPY_LOGS_FAST_FACTORS_TABLE_PREFIX = "//home/videoindex/ultra/static_factors/portions/spy_factors"

def _make_cluster(name):
    """Return the configured nile YT cluster for ``name``.

    ``'hahn'`` and ``'arnold'`` select the matching cluster; any other value
    falls back to Banach. The class is looked up lazily so that only the
    selected cluster attribute of ``clusters.yt`` is ever touched.
    """
    class_name = {'hahn': 'Hahn', 'arnold': 'Arnold'}.get(name, 'Banach')
    cluster_cls = getattr(clusters.yt, class_name)
    return cluster_cls().env(
        parallel_operations_limit=10,
        yt_spec_defaults=dict(
            pool_trees=["physical"],
            tentative_pool_trees=["cloud"]
        ),
        templates=dict(
            tmp_root='//tmp',
            title='SpyDataReduceAndPrepareToFactors'
        )
    )


def main():
    """Reduce the additive spy-stat table to a date window and dump factors.

    Steps:
      1. Read the last available date from ``--last_date_table`` and derive
         a window of ``--days_threshold`` days ending on it.
      2. Unless the ``<additive>_reduced_<N>_days`` table already carries
         matching ``_start_date`` / ``_end_date`` attributes, rebuild it
         from ``--additive`` and mark it with ``_need_update_factors``.
      3. If ``_need_update_factors`` is set, render the reduced table into
         factor strings, copy the result under
         ``SPY_LOGS_FACTORS_TABLE_PREFIX`` (and, for the '7Days' / '1Day'
         stat types, a version joined with the quick-index URLs under
         ``SPY_LOGS_FAST_FACTORS_TABLE_PREFIX``), then clear the flag.

    Raises ``ValueError`` when the last date cannot be parsed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--days_threshold', type=int, default=180)
    parser.add_argument(
        '--additive',
        default='//home/videolog/selrank_stats/cano_url_spy_stat'
    )
    parser.add_argument(
        '--last_date_table',
        required=True
    )
    parser.add_argument('--field', default='canoUrl')
    parser.add_argument('--cluster', default='banach')
    args = parser.parse_args()

    cluster = _make_cluster(args.cluster)
    yt_client = cluster.driver.client

    table_additive = args.additive
    table_reduced = args.additive + "_reduced_" + str(args.days_threshold) + "_days"

    # One summed column per stat, plus the negated lvt total.
    project_kwargs = {}
    for field in ['tvt', 'lvt', 'shows', 'users']:
        project_kwargs[field] = ne.custom(Counter(field), 'data')
    project_kwargs['lvt_neg'] = ne.custom(
        lambda x: -Counter('lvt')(x), 'data'
    )

    last_date_raw = get_spy_last_date(cluster, args.last_date_table)
    end_date = parse_date(last_date_raw)
    if end_date is None:
        # parse_date() signals failure with None; fail with a clear message
        # instead of the opaque TypeError from ``None - timedelta``.
        raise ValueError(
            'Cannot parse last date {!r} read from {}'.format(
                last_date_raw, args.last_date_table
            )
        )
    # The window is inclusive on both ends, hence ``days_threshold - 1``.
    start_date = end_date - datetime.timedelta(args.days_threshold - 1)

    print('Will get data from {} to {}'.format(start_date, end_date))
    start_date_str = start_date.strftime('%Y-%m-%d')
    end_date_str = end_date.strftime('%Y-%m-%d')

    reduced_is_fresh = (
        yt_client.exists(table_reduced)
        and yt_client.get_attribute(table_reduced, '_end_date', '') == end_date_str
        and yt_client.get_attribute(table_reduced, '_start_date', '') == start_date_str
    )
    if reduced_is_fresh:
        print('Data already ready')
    else:
        # NOTE(review): the projection keeps ``args.field`` while mapper
        # reads rec["canoUrl"]; a non-default --field looks incompatible
        # with the factor dump below — confirm.
        job = cluster.job()
        job.table(table_additive) \
           .map(CutoffByDate(start_date, end_date)) \
           .project(args.field, **project_kwargs) \
           .sort('lvt').put(table_reduced)
        job.run()
        # Record the window and request a factor dump only after the job
        # has finished.
        for attr_name, attr_value in (
            ('_end_date', end_date_str),
            ('_start_date', start_date_str),
            ('_need_update_factors', True),
        ):
            yt_client.set_attribute(table_reduced, attr_name, attr_value)

    print('Will prepare data for factors')

    VIDEOQUICK_INDEX_URLS = "//home/videoindex/ultra/docbase/adapted/media"
    need_update_factors = yt_client.get_attribute(table_reduced, '_need_update_factors', True)
    if not need_update_factors:
        print('Factors already dumped')
        return

    current_ts = int(time.time())
    stat_type = str(args.days_threshold) + "Days"
    if args.days_threshold == 1:
        stat_type = "1Day"
    factor_sufix = ''
    if "mobile" in table_additive:
        factor_sufix = "Mobile"
    # Shared tail of the temporary and destination table names.
    table_tail = factor_sufix + stat_type + "." + str(current_ts)
    table_to_transfer = "//tmp/spy_log_factors" + table_tail

    job = cluster.job()
    job.table(table_reduced) \
       .map(mapper(stat_type, factor_sufix)) \
       .sort('key', 'subkey') \
       .put(table_to_transfer)
    job.run()
    cluster.driver.copy(table_to_transfer,
                        SPY_LOGS_FACTORS_TABLE_PREFIX + table_tail + ".tab")

    if stat_type in ['7Days', '1Day']:
        # Restrict factors to the keys present in the quick-index table.
        job = cluster.job()
        job.table(VIDEOQUICK_INDEX_URLS) \
           .project('key') \
           .join(job.table(table_to_transfer), by='key', type='inner') \
           .groupby('key', 'subkey').aggregate(value=na.any('value')) \
           .sort('key', 'subkey') \
           .put(table_to_transfer + '_fast')
        job.run()
        cluster.driver.copy(table_to_transfer + '_fast',
                            SPY_LOGS_FAST_FACTORS_TABLE_PREFIX + table_tail + ".tab")

    # Use the same YT client as every other attribute read/write in this
    # function (this call previously went through ``cluster.driver``).
    yt_client.set_attribute(table_reduced, '_need_update_factors', False)

    print('finished')

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
