from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os


def parse_us(groups):
    """Reducer: emit one record per first-page search request that carries an entity id.

    Each group of raw user-session records is parsed with ``libra``; groups
    that fail to parse are skipped.  For every request of a supported type
    (desktop/touch web, desktop/touch/app video) with ``PageNo == 0`` and a
    non-empty object id, the main and parallel result blocks are scanned and
    watch time / clicks are accumulated.

    Args:
        groups: iterable of ``(key, records)`` pairs as supplied by a nile
            ``groupby(...).reduce(...)``; only ``records`` is used.

    Yields:
        Record with fields:
            ui           -- one of 'desktop video', 'touch video', 'desktop web',
                            'touch web', 'app video'
            q            -- lower-cased query string
            object_id    -- 'UPPER.VideoExtraItems.object_id' for video UIs,
                            'UPPER.EntitySearch.Ontoid' otherwise
            vh_tvt       -- summed watch value of video results whose URL contains
                            'frontend.vh' or 'kinopoisk.ru'
            other_tvt    -- summed watch value of all other video results
            vid_clicks   -- clicks in web/wizard blocks on URLs containing
                            'yandex.ru/video' or 'kinopoisk.ru'
            other_clicks -- clicks in web/wizard blocks on URLs without 'yandex.ru'
            vh_urls, other_urls -- lists of the distinct URLs counted above
            date         -- 'YYYY-MM-DD' derived from the request timestamp
                            (local time of the worker)
    """
    import libra

    for _, recs in groups:
        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Deliberate best-effort: a session libra cannot parse is dropped
            # rather than failing the whole reduce operation.
            continue

        for r in session:
            if r.IsA('TYandexVideoRequest'):
                ui = 'desktop video'
            elif r.IsA('TTouchYandexVideoRequest'):
                ui = 'touch video'
            elif r.IsA('TYandexWebRequest'):
                ui = 'desktop web'
            elif r.IsA('TTouchYandexWebRequest'):
                ui = 'touch web'
            elif (r.IsA('TMobileAppYandexVideoRequest')
                  or r.IsA('TMobileAppYandexVideoPortalRequest')
                  or r.IsA('TMobileAppYandexRelatedVideoRequest')):
                ui = 'app video'
            else:
                continue

            # Only the first results page is analysed.
            if r.PageNo != 0:
                continue

            q = str(r.Query).lower()
            date = datetime.datetime.fromtimestamp(r.Timestamp).date().isoformat()
            spv = r.SearchPropsValues

            # Video and web services expose the entity id under different keys.
            if 'video' in ui:
                object_id = spv.get('UPPER.VideoExtraItems.object_id')
            else:
                object_id = spv.get('UPPER.EntitySearch.Ontoid')

            if not object_id:
                continue

            vh_tvt = 0
            other_tvt = 0
            vid_clicks = 0
            other_clicks = 0
            vh_urls = set()
            other_urls = set()

            # Main and parallel blocks are handled identically; the getters are
            # called lazily, in the same order as before (main first).
            for get_blocks in (r.GetMainBlocks, r.GetParallelBlocks):
                for bl in get_blocks():
                    result = bl.GetMainResult()

                    if result.IsA("TVideoResult"):
                        url = str(result.Url)

                        duration = r.FindVideoDurationInfo(result)
                        if duration:
                            # Played time is capped by the reported duration.
                            dt = min(duration.PlayingDuration, duration.Duration)
                        else:
                            dt = 0

                        heartbeat = r.FindVideoHeartbeat(result, 'ANY')
                        ht = heartbeat.Ticks if heartbeat else 0

                        # Take the larger of the two watch signals.
                        res_tvt = max(dt, ht)

                        if 'frontend.vh' in url or 'kinopoisk.ru' in url:
                            vh_tvt += res_tvt
                            vh_urls.add(url)
                        else:
                            other_tvt += res_tvt
                            other_urls.add(url)

                    elif (result.IsA('TWebResult')
                          or result.IsA('TBlenderWizardResult')
                          or result.IsA('TWizardResult')):
                        for cl in bl.GetClicks():
                            url = str(cl.Url)
                            if 'yandex.ru/video' in url or 'kinopoisk.ru' in url:
                                vid_clicks += 1
                                vh_urls.add(url)
                            elif 'yandex.ru' not in url:
                                # Clicks on other yandex.ru pages are ignored.
                                other_clicks += 1
                                other_urls.add(url)

            yield Record(ui=ui, q=q, object_id=object_id, vh_tvt=vh_tvt, other_tvt=other_tvt,
                         vid_clicks=vid_clicks, other_clicks=other_clicks, vh_urls=list(vh_urls),
                         other_urls=list(other_urls), date=date
                        )


def recs_combination(recs):
    """Mapper: fan every record out over all real-value / '_total_' slices.

    For each input record, each of the three dimensions ``object_item``,
    ``ui`` and ``vh_filter`` is taken either as its own value or as the
    literal '_total_', giving 2 * 2 * 2 = 8 output records.  The metric
    fields and ``date`` are copied unchanged into every output record.
    """
    for rec in recs:
        # Values shared by every slice produced from this record.
        metrics = dict(
            vh_tvt=rec.vh_tvt,
            other_tvt=rec.other_tvt,
            vid_clicks=rec.vid_clicks,
            other_clicks=rec.other_clicks,
            reqs=rec.reqs,
            date=rec.date,
        )

        slices = product(
            (rec.object_item, '_total_'),
            (rec.ui, '_total_'),
            (rec.vh_filter, '_total_'),
        )

        for object_item, ui, vh_filter in slices:
            yield Record(object_item=object_item, ui=ui, vh_filter=vh_filter, **metrics)

def process_data_for_stat(date):
    """Build the per-day ``final_for_stat`` table under ``$job_root`` on Hahn.

    Runs seven nile jobs one after another; each job reads tables written by
    the previous one, so the order of the stages must not change.

    Args:
        date: date string appended to the job root and to the user-sessions
            path (``main`` passes 'YYYY-MM-DD').
    """

    # All '$job_root/...' paths below resolve to a per-date directory.
    cluster = clusters.yt.Hahn(
    ).env(templates=dict(job_root='home/videolog/vika-pavlova/2082-vh_and_clicks_by_onto_id/' + date),
          yt_spec_defaults=dict(pool_trees=["physical"],
                                tentative_pool_trees=["cloud"]
                               ),
          parallel_operations_limit=10
         )


    # --- Job 1: video-hosting onto_ids, parsed sessions, per-object aggregates ---
    job = cluster.job()

    # Episodes that have an onto_id ...
    t = job.table('home/videoindex/full/video_hosting/episodes'
                 ).filter(sf.defined('onto_id'))

    # ... and are flagged as a film or a series.
    tt = t.filter(sf.or_(sf.equals('is_film', True),
                        sf.equals('is_series', True)
                        )
                 )

    # Save the filtered episodes, then count rows per onto_id.
    ttt = tt.put('$job_root/vh_urls_with_ontoids'
                                       ).groupby('onto_id'
                                                ).aggregate(urls=na.count()
                                                           ).sort('urls').put('$job_root/check_defined_ontoids')

    us = job.table('user_sessions/pub/search/daily/' + date + '/clean')

    # parse_us needs the blockstat dictionary and the libra shared library
    # shipped alongside the reducer.
    reqs = us.groupby('key'
                     ).sort('subkey'
                           ).reduce(parse_us,
                                    files=[nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
                                           nile.files.RemoteFile('statbox/resources/libra.so')],
                                    memory_limit=4000,
                                    intensity='data',
                                   ).put('$job_root/reqs')

    # One row per (ui, object_id, date): request count plus summed metrics.
    aggr = reqs.groupby('ui','object_id', 'date').aggregate(reqs=na.count(),
                                                    vh_tvt=na.sum('vh_tvt'),
                                                    other_tvt=na.sum('other_tvt'),
                                                    vid_clicks=na.sum('vid_clicks'),
                                                    other_clicks=na.sum('other_clicks')
                                                   ).put('$job_root/aggr_objects')

    # Right join against the video-hosting onto_ids; vh_filter is 'is_vh' when
    # 'onto_id' is truthy and 'not_vh' otherwise (presumably onto_id is empty
    # for aggregate rows with no video-hosting match -- confirm).
    ttt.join(aggr, by_left = 'onto_id', by_right = 'object_id', type ='right'
             ).project('date', 'object_id', 'reqs', 'vh_tvt', 'vid_clicks', 'other_clicks', 'other_tvt', 'ui',
                       vh_filter = ne.custom(lambda x: 'is_vh' if x else 'not_vh', 'onto_id')
                      ).put('$job_root/join_onto_ids')

    job.run()


    # --- Job 2: attach 'oo_query' and 'ttype' from the shared cards table ---
    job = cluster.job()

    t1 = job.table('home/videolog/vika-pavlova/2082-vh_and_clicks_by_onto_id/cards')
    t2 = job.table('$job_root/join_onto_ids')

    t1.join(t2, by_left = 'onto_id', by_right = 'object_id', type ='inner'
           ).project('date', 'object_id', 'reqs', 'vh_tvt', 'vid_clicks', 'other_clicks', 'other_tvt', 'ui',
                     'vh_filter', 'oo_query', 'ttype').put('$job_root/final')
    job.run()


    # --- Job 3: keep rows whose first 'ttype' element matches 'Film' ---
    job = cluster.job()

    final_table = job.table('$job_root/final')

    # type_1 is ttype[0] (or '-' when ttype is empty); it is used only for the
    # filter and dropped by the second project.
    final_table.project(ne.all(),type_1 = ne.custom(lambda x: x[0] if x else '-','ttype')
                       ).filter(sf.contains('type_1', 'Film')
                                    ).project('date', 'object_id', 'reqs', 'vh_tvt', 'vid_clicks',
                                              'other_clicks', 'other_tvt', 'ui', 'vh_filter', 'oo_query',
                                              'ttype'
                                             ).put('$job_root/final_for_films')

    job.run()


    # --- Job 4: build the display key and the ranking columns ---
    job = cluster.job()

    t = job.table('$job_root/final_for_films')

    # object_item = '<object_id> <oo_query>'; total = vh_tvt + other_tvt;
    # negative_total = -total, so sorting by it puts the largest totals first.
    t.project('date', 'vid_clicks', 'other_clicks', 'vh_tvt', 'other_tvt', 'reqs', 'ui', 'vh_filter',
                   object_item=ne.custom(lambda a, b: a + ' ' + b, 'object_id', 'oo_query'),
                   total=ne.custom(lambda a, b: a + b, 'vh_tvt', 'other_tvt'),
                   negative_total=ne.custom(lambda a, b: (-1)*(a + b),'vh_tvt', 'other_tvt')
                  ).sort('negative_total').put('$job_root/prepare_for_top')

    job.run()


    # --- Job 5: top-1000 rows + remaining 'is_vh' rows + an 'other' bucket ---
    job = cluster.job()

    prepare_for_top = job.table('$job_root/prepare_for_top')

    # NOTE(review): mode=max passes the builtin function, not the string 'max'
    # -- confirm this is what nile's top() expects.
    t1 = prepare_for_top.top(1000, by = 'total', mode=max).sort('negative_total').put('$job_root/temp_1')

    # 'is_vh' rows joined against t1 with type='left_only' (presumably keeps
    # only rows with no (object_item, ui) match in t1 -- confirm).
    t2 = prepare_for_top.filter(sf.equals('vh_filter', 'is_vh')
                               ).join(t1, by=('object_item', 'ui'), type='left_only'
                                     ).sort('negative_total'
                                           ).put('$job_root/temp_2')

    temp = job.concat(t1, t2).put('$job_root/temp_1_2')

    # Rows selected by neither t1 nor t2 are summed per (date, ui) into a
    # single row labelled object_item='other', vh_filter='not_vh'.
    t3 = prepare_for_top.join(temp, by=('object_item', 'ui'), type='left_only'
                             ).groupby('date', 'ui').aggregate(other_clicks=na.sum('other_clicks'),
                                                         other_tvt=na.sum('other_tvt'),
                                                         reqs=na.sum('reqs'),
                                                         vh_tvt=na.sum('vh_tvt'),
                                                         vid_clicks=na.sum('vid_clicks'),
                                                         total=na.sum('total'),
                                                         negative_total=na.sum('negative_total')
                                                        ).project(ne.all(),
                                                                object_item=ne.const('other'),
                                                                vh_filter=ne.const('not_vh')
                                                                ).put('$job_root/temp_3')

    job.concat(t1, t2, t3).put('$job_root/top')

    job.run()


    # --- Job 6: collapse ui to a platform and add '_total_' slices ---
    job = cluster.job()

    t = job.table('$job_root/top')

    # Three successive projections map any ui containing 'desktop' / 'touch' /
    # 'app' to that bare word, leaving other values untouched.
    tt = t.project('date', 'reqs', 'vh_tvt', 'vid_clicks', 'other_clicks', 'other_tvt', 'vh_filter',
                   'object_item', ui=ne.custom(lambda x: 'desktop' if 'desktop' in x else x, 'ui')
                  ).project('date', 'reqs', 'vh_tvt', 'vid_clicks', 'other_clicks', 'other_tvt', 'vh_filter',
                            'object_item', ui=ne.custom(lambda x: 'touch' if 'touch' in x else x, 'ui')
                           ).project('date', 'reqs', 'vh_tvt', 'vid_clicks', 'other_clicks', 'other_tvt',
                                     'vh_filter', 'object_item',
                                     ui=ne.custom(lambda x: 'app' if 'app' in x else x, 'ui')
                                    )

    # recs_combination emits each row under every real-value / '_total_'
    # combination; summing afterwards produces the totals for each slice.
    tt.map(recs_combination, memory_limit=4000).groupby('date', 'object_item', 'ui', 'vh_filter'
                                                       ).aggregate(reqs=na.sum('reqs'),
                                                                   vh_tvt=na.sum('vh_tvt'),
                                                                   other_tvt=na.sum('other_tvt'),
                                                                   vid_clicks=na.sum('vid_clicks'),
                                                                   other_clicks=na.sum('other_clicks')
                                                                  ).put('$job_root/for_stat')

    job.run()


    # --- Job 7: rename columns for the table that put_data_to_stat uploads ---
    job = cluster.job()

    final_for_stat = job.table('$job_root/for_stat')

    # date -> fielddate, ui -> platform, object_item -> ontoid,
    # vh_filter -> is_on_vh; metric columns keep their names.
    final_for_stat.project('reqs','vh_tvt', 'vid_clicks', 'other_tvt', 'other_clicks',
                           fielddate=ne.custom(lambda a: a, 'date'),
                           platform=ne.custom(lambda a: a, 'ui'),
                           ontoid=ne.custom(lambda a: a, 'object_item'),
                           is_on_vh=ne.custom(lambda a: a, 'vh_filter')
                          ).put('$job_root/final_for_stat')

    job.run()


def put_data_to_stat(date):
    """Publish the day's ``final_for_stat`` table to the Statface report.

    The upload token is read from the ``STAT_TOKEN`` environment variable
    (``KeyError`` if it is not set).  The table is published remotely from
    the 'hahn' proxy into the daily-scale report 'Video.All/tvt_vh_share',
    synchronously and without uploading a report config.

    Args:
        date: date string naming the per-day directory that holds
            ``final_for_stat``.
    """
    stat_client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        token=os.environ['STAT_TOKEN'],
    )

    report = ns.StatfaceReport()
    report = report.path('Video.All/tvt_vh_share')
    report = report.scale('daily')
    report = report.client(stat_client)

    source_table = ('//home/videolog/vika-pavlova/2082-vh_and_clicks_by_onto_id/'
                    + date + '/final_for_stat')

    report.remote_publish(
        proxy='hahn',
        table_path=source_table,
        async_mode=False,
        upload_config=False,
    )



def main():
    """Command-line entry point: process and upload every day in a date range.

    Requires ``--start_date`` and ``--end_date``; both ends are inclusive, as
    produced by ``pandas.date_range``.  Each day is first computed with
    ``process_data_for_stat`` and then uploaded with ``put_data_to_stat``.
    """
    arg_parser = argparse.ArgumentParser()
    for flag in ('--start_date', '--end_date'):
        arg_parser.add_argument(flag, type=str, required=True)
    options = arg_parser.parse_args()

    days = pd.date_range(start=options.start_date, end=options.end_date)
    for day in days:
        # 'YYYY-MM-DD' -- the form used in the table paths.
        day_str = day.date().isoformat()
        process_data_for_stat(day_str)
        put_data_to_stat(day_str)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
