#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import datetime
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record
)
import re
import math
import json
import requests
import nile
import time


class Counter(object):
    """Callable that totals one field across every entry of a mapping.

    ``field`` is the key looked up inside each entry. Entries whose value
    for that key is ``None`` (or otherwise falsy) contribute zero.
    """

    def __init__(self, field):
        self.field = field

    def __call__(self, obj):
        """Return the sum of ``obj[key][self.field]`` over all keys of ``obj``."""
        name = self.field
        return sum((obj[key][name] or 0) for key in obj)


def parse_date(s):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Returns ``None`` when ``s`` cannot be parsed with that format
    (wrong type or wrong layout).
    """
    try:
        parsed = datetime.datetime.strptime(s, '%Y-%m-%d')
    except (TypeError, ValueError, AttributeError):
        return None
    return parsed.date()


class CutoffByDate(object):
    """Mapper that trims each record's ``data`` to a closed date range.

    ``data`` is expected to be a mapping keyed by ``YYYY-MM-DD`` strings.
    Only keys whose date lies in ``[start_date, end_date]`` are kept, and
    records left with an empty ``data`` are dropped.
    """

    def __init__(self, start_date, end_date):
        self.start_date = start_date
        self.end_date = end_date

    def _in_range(self, key):
        """Return True when ``key`` parses to a date inside the range."""
        day = parse_date(key)
        # parse_date returns None for malformed keys; ordering None against
        # a date raises TypeError, so such keys are treated as out of range
        # instead of aborting the whole mapper.
        if day is None:
            return False
        return self.start_date <= day <= self.end_date

    def __call__(self, records):
        """Yield a copy of every record with ``data`` cut to the range."""
        for rec in records:
            result = rec.to_dict()
            result['data'] = {
                k: v for k, v in result['data'].items()
                if self._in_range(k)
            }
            if result['data']:
                yield Record(**result)


def get_max_date(records):
    """Mapper yielding, for each record, the latest date key of its ``data``.

    The yielded record has a single ``max_date`` field holding the original
    key string. Keys that ``parse_date`` cannot parse are ignored, and a
    record with no parseable key yields nothing.
    """
    for rec in records:
        max_date = None
        max_date_str = None
        for k in rec['data']:
            current = parse_date(k)
            # Skip malformed keys: comparing None with a date raises
            # TypeError and would fail the whole job.
            if current is None:
                continue
            if max_date is None or current > max_date:
                max_date = current
                max_date_str = k
        if max_date is not None:
            yield Record(max_date=max_date_str)


def _resolve_yt_token(args):
    """Return the YT OAuth token, preferring ``YT_TOKEN`` over ``--token``."""
    token = os.environ.get('YT_TOKEN') or args.token
    if not token:
        raise Exception('YT token not set: export YT_TOKEN or pass --token')
    return token


def _find_last_date(hahn, table_additive):
    """Return the latest date found in ``table_additive`` or None if absent."""
    ts = str(time.time())
    table_to_calc_end_date = "//tmp/max_spy_date_" + ts
    # A dedicated job keeps this flow out of any later run.
    job = hahn.job()
    job.table(
        table_additive
    ).map(
        get_max_date
    ).aggregate(max_date=na.max('max_date')).put(table_to_calc_end_date)
    job.run()
    end_date = None
    for rec in hahn.read(table_to_calc_end_date):
        end_date = parse_date(rec["max_date"])
    return end_date


def _rebuild_reduced_table(hahn, table_additive, table_reduced, field,
                           start_date, end_date, frame_url_map):
    """Recompute ``table_reduced`` from ``table_additive`` for the date range.

    When ``frame_url_map`` is given, the result is additionally joined with
    that table by ``frame_url`` and re-aggregated per ``url``.
    """
    project_kwargs = {}
    for name in ['tvt', 'lvt', 'shows', 'users']:
        project_kwargs[name] = ne.custom(Counter(name), 'data')
    project_kwargs['lvt_neg'] = ne.custom(
        lambda x: -Counter('lvt')(x), 'data'
    )

    # Fresh job: this run contains only the reduction flow.
    job = hahn.job()
    job.table(
        table_additive
    ).map(
        CutoffByDate(start_date, end_date)
    ).project(
        field, **project_kwargs
    ).sort('lvt').put(
        table_reduced
    )
    job.run()

    if frame_url_map:
        job = hahn.job()
        job.table(table_reduced).join(
            job.table(frame_url_map), by="frame_url", type='inner'
        ).groupby('url').aggregate(
            url=na.any('url'),
            users=na.sum('users'),
            tvt=na.sum('tvt'),
            lvt=na.sum('lvt'),
            shows=na.sum('shows')
        ).put(table_reduced)
        job.run()


def _start_banach_transfer(table_reduced, banach_table, token):
    """Ask the transfer manager to copy ``table_reduced`` from hahn to banach."""
    req = requests.post(
        'http://transfer-manager.yt.yandex.net/api/v1/tasks/',
        json={
            'source_cluster': 'hahn',
            'source_table': table_reduced,
            'destination_cluster': 'banach',
            'destination_table': banach_table,
        },
        headers={'Authorization': 'OAuth {}'.format(token)}
    )
    if req.status_code == 200:
        print(
            'Table transfer from hahn to banach started, '
            'you can look it up here:'
            'https://transfer-manager.yt.yandex-team.ru/task?id='
            '{}'.format(req.content)
        )
    else:
        print(
            'Got error whily trying to transfer, status code {}, '
            'content: {}'.format(req.status_code, req.content)
        )


def main():
    """Build the reduced stats table for a date range and ship it to banach.

    Steps:
      1. Parse command-line arguments and connect to the hahn cluster.
      2. Determine the date range: ``--end_date`` or the latest date found
         in the additive table; ``--start_date`` or ``--days_threshold``
         days ending at the end date.
      3. Rebuild the reduced table unless its ``_start_date``/``_end_date``
         attributes already match the requested range.
      4. Start a transfer of the reduced table to banach, unless
         ``--banach_table`` is empty, ``none`` or ``null``.

    Raises:
        Exception: if no YT token is available, a date argument cannot be
            parsed, no end date can be determined, or neither a start date
            nor a days threshold is set.
    """
    FRAME_URL_TO_INDEX_URL_MAP = "//home/videolog/selrank_stats/frame_url_to_index_url_map"

    parser = argparse.ArgumentParser()
    parser.add_argument('--token')
    parser.add_argument('--end_date')
    parser.add_argument('--start_date')
    parser.add_argument('--days_threshold', type=int, default=180)
    parser.add_argument(
        '--additive',
        default='//home/videolog/selrank_stats/additive'
    )
    parser.add_argument(
        '--reduced',
        default='//home/videolog/selrank_stats/reduced'
    )
    parser.add_argument(
        '--banach_table',
        default='//home/videoindex/home/tadart/selrank/selrank_stats/reduced'
    )
    parser.add_argument('--field', default='frame_url')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--use_frame_url_map', type=int, default=0)
    args = parser.parse_args()

    kwargs = {'token': _resolve_yt_token(args)}
    hahn = clusters.yt.Hahn(**kwargs).env(parallel_operations_limit=10,
                                          yt_spec_defaults=dict(
                                              pool_trees=["physical"],
                                              tentative_pool_trees=["cloud"]
                                          ))

    table_additive = args.additive
    table_reduced = args.reduced
    banach_table = args.banach_table
    use_frame_url_map = args.use_frame_url_map

    if not args.end_date:
        print("End date not set, will calc last date with data")
        end_date = _find_last_date(hahn, table_additive)
        if end_date is None:
            # Without this check an empty result would surface as a
            # NameError/TypeError further down.
            raise Exception(
                'Could not determine last date with data in {}'.format(
                    table_additive
                )
            )
        print("Last date is {}".format(end_date))
    else:
        end_date = parse_date(args.end_date)
        if end_date is None:
            raise Exception(
                'Invalid end date {!r}, expected YYYY-MM-DD'.format(
                    args.end_date
                )
            )

    if args.start_date:
        start_date = parse_date(args.start_date)
        if start_date is None:
            raise Exception(
                'Invalid start date {!r}, expected YYYY-MM-DD'.format(
                    args.start_date
                )
            )
    elif args.days_threshold:
        # The range is inclusive, hence "threshold - 1" days back.
        start_date = end_date - datetime.timedelta(args.days_threshold - 1)
    else:
        raise Exception('Start date and days threshold not set')

    print('Will get data from {} to {}'.format(start_date, end_date))
    start_date_str = start_date.strftime('%Y-%m-%d')
    end_date_str = end_date.strftime('%Y-%m-%d')
    client = hahn.driver.client
    if client.exists(table_reduced) and \
       client.get_attribute(table_reduced, '_end_date', '') == end_date_str and \
       client.get_attribute(table_reduced, '_start_date', '') == start_date_str:
        print('Data already ready')
    else:
        _rebuild_reduced_table(
            hahn, table_additive, table_reduced, args.field,
            start_date, end_date,
            FRAME_URL_TO_INDEX_URL_MAP if use_frame_url_map else None
        )
        # Record the covered range so the next run can skip the rebuild.
        client.set_attribute(table_reduced, '_end_date', end_date_str)
        client.set_attribute(table_reduced, '_start_date', start_date_str)
        client.set_attribute(table_reduced, '_need_update_factors', True)

    print('finished')
    if not args.banach_table or args.banach_table.lower() in {'none', 'null'}:
        print('Banach table not specified, exiting')
        sys.exit(0)

    _start_banach_transfer(table_reduced, banach_table, kwargs['token'])


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
