#-*- coding: UTF-8 -*-
import nile
import argparse
import time
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf,
    typing as qt
)
from copy import deepcopy
import datetime

def aggregate_spy_rows(groups):
    """Collapse every group of spy records into one record with summed stats.

    Args:
        groups: iterable of ``(key, records)`` pairs. Each record is read via
            item access and must provide ``canoUrl``, ``playerData``,
            ``staticData`` and ``data``, where ``data`` maps a date to a
            mapping with the ``lvt``, ``shows``, ``tvt`` and ``users`` values.

    Yields:
        One ``Record`` per group. ``canoUrl``, ``playerData`` and
        ``staticData`` are taken from the last record of the group; ``data``
        maps every date seen in the group to the per-metric sums (as floats).
    """
    metrics = ("lvt", "shows", "tvt", "users")
    for _key, recs in groups:
        # Reset *all* output fields for each group. Previously ``canoUrl`` was
        # never initialised (an unused ``url`` was set instead), so an empty
        # group would either fail or silently reuse the previous group's URL.
        canoUrl = ""
        playerData = ""
        staticData = ""
        data = {}
        for rec in recs:
            canoUrl = rec["canoUrl"]
            playerData = rec["playerData"]
            staticData = rec["staticData"]
            rec_data = rec["data"]
            for date in rec_data:
                day_stats = rec_data[date]
                totals = data.get(date)
                if totals is None:
                    totals = data[date] = dict.fromkeys(metrics, 0.0)
                for metric in metrics:
                    # Convert on every addition, not only on the first one, so
                    # that all contributions are treated the same way.
                    totals[metric] += float(day_stats[metric])
        yield Record(canoUrl=canoUrl, data=data,
                     playerData=playerData, staticData=staticData)


def get_spy_last_date(cluster, table):
    """Return the ``last_date`` stored in the ``<table>_last_date`` table.

    Args:
        cluster: object whose ``read(path)`` method returns an iterable of
            records exposing a ``last_date`` attribute.
        table: path of the spy data table; ``"_last_date"`` is appended to it
            to obtain the path that is actually read.

    Returns:
        The ``last_date`` attribute of the first record of that table.

    Raises:
        IndexError: if the ``<table>_last_date`` table contains no records.
    """
    ld_table = table + "_last_date"
    recs = list(cluster.read(ld_table))
    if not recs:
        # Same exception type as plain indexing would raise, but the message
        # tells the operator which table is unexpectedly empty.
        raise IndexError("table {} contains no records".format(ld_table))
    return recs[0].last_date


def main():
    """Join prepared spy data with player data and write the joined tables.

    Command-line options (all required):
        --cluster: ``banach`` or ``arnold``; any other value selects Hahn.
        --spy_data: spy table; its ``<spy_data>_last_date`` companion table
            provides the date of the latest available data.
        --player_data, --player_data_fast: player tables, concatenated and
            joined with the spy rows.
        --prepared_spy_data: input table with the spy rows.
        --joined_table: output table, written with an explicit schema.
        --joined_table_uniq_by_content: output table with one row per
            ``playerData`` value.

    Nothing is recomputed when the ``_max_date`` attribute of the joined table
    already equals the last spy date. After both jobs have run, that attribute
    is set on both output tables.
    """
    parser = argparse.ArgumentParser()
    for option in ('--cluster', '--spy_data', '--player_data',
                   '--player_data_fast', '--prepared_spy_data',
                   '--joined_table', '--joined_table_uniq_by_content'):
        parser.add_argument(option, required=True)
    args = parser.parse_args()

    # Look the cluster class up lazily: only the selected one is touched.
    if args.cluster == 'banach':
        cluster_cls = clusters.yt.Banach
    elif args.cluster == 'arnold':
        cluster_cls = clusters.yt.Arnold
    else:
        cluster_cls = clusters.yt.Hahn
    cluster = cluster_cls().env(parallel_operations_limit=10)

    spy_data_last_date = get_spy_last_date(cluster, args.spy_data)
    joined_table_last_date = cluster.driver.client.get_attribute(
        args.joined_table, '_max_date', '')

    # Skip the whole run when the joined table is already built for this date.
    if spy_data_last_date and spy_data_last_date == joined_table_last_date:
        # Parenthesised single-argument form prints the same text on Python 2
        # and is also valid Python 3 syntax.
        print("Already joined")
        return

    schema = {
        'canoUrl': str,
        'data': qt.Json,
        'playerData': str,
        'staticData': qt.Json,
    }

    joined_table_non_schematised = '{}_non_schematized'.format(args.joined_table)
    job = cluster.job().env(
        yt_spec_defaults=dict(
            pool_trees=["physical"],
            tentative_pool_trees=["cloud"],
        ),
        templates=dict(title='JoinCanonizedSpyDataWithIndex'),
    )

    # One row per (canoUrl, page_url, frame_url); rows whose canoUrl contains
    # "yandex.ru/" are dropped first.
    uniq_spy_table = (
        job.table(args.prepared_spy_data)
        .filter(nf.custom(lambda url: "yandex.ru/" not in url, "canoUrl"))
        .groupby('canoUrl', 'page_url', 'frame_url')
        .aggregate(
            data=na.any('data'),
            playerData=na.any('playerData'),
            staticData=na.any('static_data'),
        )
    )
    player_data = job.concat(
        job.table(args.player_data),
        job.table(args.player_data_fast),
    )

    # Branch 1: match spy rows to player rows by URL. Rows whose player data is
    # identical on both sides are filtered out here -- presumably because the
    # second branch already covers them (TODO confirm).
    joined_by_url = (
        uniq_spy_table
        .project(spyUrl='canoUrl', data='data',
                 spyPlayerData='playerData', staticData='staticData')
        .join(player_data, by_left="spyUrl", by_right="GroupingUrl", type='inner')
        .filter(nf.custom(lambda left, right: left != right,
                          'playerData', 'spyPlayerData'))
        .project(canoUrl='spyUrl', playerData='playerData',
                 data='data', staticData='staticData')
    )

    # Branch 2: sum the spy stats per non-empty playerData and attach them to
    # the player rows with the same playerData, keyed by their GroupingUrl.
    # NOTE(review): ne.all(['canoUrl']) presumably keeps every field except
    # canoUrl -- confirm against the nile extractors documentation.
    joined_by_player = (
        uniq_spy_table
        .filter(nf.custom(lambda value: value != "", 'playerData'))
        .groupby('playerData')
        .reduce(aggregate_spy_rows)
        .project(ne.all(['canoUrl']))
        .join(player_data, by="playerData", type='inner')
        .project(canoUrl='GroupingUrl', playerData='playerData',
                 data='data', staticData='staticData')
    )

    # Merge both branches per canoUrl and store the result in the intermediate
    # table; the same stream then feeds the "unique by content" table.
    merged = (
        job.concat(joined_by_url, joined_by_player)
        .groupby('canoUrl')
        .reduce(aggregate_spy_rows)
        .sort('canoUrl')
        .put(joined_table_non_schematised)
    )
    merged.groupby("playerData") \
          .aggregate(data=na.any('data')) \
          .put(args.joined_table_uniq_by_content)
    job.run()

    # A separate job copies the intermediate table into the final one with the
    # explicit schema.
    schema_job = cluster.job()
    schema_job.table(joined_table_non_schematised) \
              .put(args.joined_table, schema=schema)
    schema_job.run()

    # Record the processed date so that the next run can detect it is up to date.
    for output_table in (args.joined_table, args.joined_table_uniq_by_content):
        cluster.driver.client.set_attribute(
            output_table, '_max_date', spy_data_last_date)

# Run the join only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
