# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os


# Shared Nile handle for the Hahn YT cluster. Every job in this script is
# created from it; ``job_root`` is the template used for the output tables.
cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
    templates={
        'job_root': '//home/videolog/vika-pavlova/5112-online_cinemas',
    },
    yt_spec_defaults={
        'pool_trees': ["physical"],
        'use_default_tentative_pool_trees': True,
    },
    parallel_operations_limit=10
)

def parse_direct(groups):
    """Reducer: collect the ``TDirectResult`` blocks of every request.

    Each group of session rows is parsed with ``libra.ParseSession``. For
    every request in the parsed session the main blocks are scanned and the
    ones whose main result is a ``TDirectResult`` are collected.

    Exactly one record is yielded per request that has at least one such
    block. Yielding inside the block loop (one record per direct block, each
    carrying a growing prefix of the list) would duplicate the request in
    the downstream join on ``(UID, ReqId, Query)``.

    Args:
        groups: iterable of ``(key, records)`` pairs as passed to a Nile
            reducer; ``key.key`` is used as the user id.

    Yields:
        Record with fields ``Query`` (lower-cased request text), ``ReqId``,
        ``ts`` (request timestamp), ``UID`` and ``Direct_Results`` - a list
        of dicts with the keys ``Url``, ``Position``, ``DwellTime_list`` and
        the click-count key.
    """
    # Imported inside the reducer: prepare_for_stat attaches libra.so to the
    # reduce operation rather than relying on a local installation.
    import libra

    for key, recs in groups:
        uid = key.key
        try:
            session = libra.ParseSession(recs, "blockstat.dict", None, ["vid", "web"])
        except Exception:
            # Deliberate best effort: a session libra cannot parse is skipped
            # instead of failing the whole reduce operation.
            continue

        for request in session:
            direct_results = []
            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                if not result.IsA('TDirectResult'):
                    continue
                clicks = block.GetClicks()
                direct_results.append({
                    "Url": str(result.Url),
                    "Position": result.Position,
                    "DwellTime_list": [click.DwellTime for click in clicks],
                    # NOTE(review): this key appears to be spelled with a
                    # non-ASCII (Cyrillic) letter; parse_us reads the very
                    # same spelling, so it must stay byte-identical.
                    'Cliсks': len(clicks),
                })

            if not direct_results:
                continue

            yield Record(Query=str(request.Query).lower(),
                         ReqId=request.ReqID,
                         ts=request.Timestamp,
                         Direct_Results=direct_results,
                         UID=uid)


# Sub-domains of these sites are folded into the bare two-label domain.
_FOLDED_HOST_SUFFIXES = ('.ctc.ru', '.premier.one', '.tnt-online.ru', '.megogo.ru')

# Hosts tracked by the report. Lookups are done on lower-cased hosts, so the
# entries here must be lower case as well.
_CINEMA_HOSTS = frozenset([
    'more.tv', 'www.ivi.ru', 'ctc.ru', 'premier.one', 'tnt-online.ru',
    'start.ru', 'okko.tv', 'megogo.ru', 'www.kinopoisk.ru', 'hd.kinopoisk.ru',
])


def _cinema_host(url):
    """Return the lower-cased host of ``url`` with known sub-domains folded."""
    host = urlparse.urlparse(url).netloc.lower()
    if host.endswith(_FOLDED_HOST_SUFFIXES):
        host = '.'.join(host.split('.')[-2:])
    return host


def _empty_host_stat():
    """Return a fresh per-host counter dict for a single request."""
    return {'shows': 0, 'clicks': 0, 'real_positions': [],
            'dwelltime_clicks': [], 'positions': [], 'direct_shows': 0,
            'direct_clicks': 0}


def parse_us(recs):
    """Mapper: per-request statistics for every tracked cinema host.

    Only first-page rows (``PageNo == 0``) are processed. For each row the
    function looks at three optional fields:

    * ``Results`` - results with an empty ``Url`` or a truthy ``IsWizard``
      are skipped; the remaining ones are numbered from 1 (``positions``),
      and ``Position + 1`` is stored as the ``real_positions`` entry.
    * ``Clicks`` - counted only for hosts already seen in ``Results``;
      dwell times above 120 are collected separately.
    * ``Direct_Results`` - marks the host as shown in Direct and adds up
      the click counts stored by ``parse_direct``.

    Hosts are compared in lower case. The original whitelist entry
    ``'Okko.tv'`` could never match a lower-case ``okko.tv`` URL, so that
    host is now tracked (and emitted) as ``okko.tv``.

    Args:
        recs: iterable of joined rows (mapping-like records).

    Yields:
        One Record per (request, tracked host) with the raw counters, the
        0/1 flags ``has_click``, ``has_shows``, ``in_top_3`` and
        ``has_dwelltime_clicks``, and the whole counter dict as ``stat``.
    """
    for rec in recs:
        hosts_top_3 = set()
        all_hosts = {}
        if rec["PageNo"] != 0:
            continue

        if "Results" in rec and rec["Results"]:
            pos = 1
            for result in rec["Results"]:
                if not result["Url"] or result["IsWizard"]:
                    continue
                host = _cinema_host(result["Url"])
                if host in _CINEMA_HOSTS:
                    if host not in all_hosts:
                        all_hosts[host] = _empty_host_stat()
                    stat = all_hosts[host]
                    stat['shows'] += 1
                    stat['real_positions'].append(result["Position"] + 1)
                    stat['positions'].append(pos)
                    if pos < 4:
                        hosts_top_3.add(host)
                # Every non-wizard result with a URL takes up a position,
                # whether or not its host is tracked.
                pos += 1

        if "Clicks" in rec and rec["Clicks"]:
            for click in rec["Clicks"]:
                if not click["Url"]:
                    continue
                host = _cinema_host(click["Url"])
                # Clicks are attributed only to hosts seen among the results.
                if host in all_hosts:
                    all_hosts[host]['clicks'] += 1
                    if click["DwellTime"] > 120:
                        all_hosts[host]['dwelltime_clicks'].append(click["DwellTime"])

        if "Direct_Results" in rec and rec["Direct_Results"]:
            for result in rec["Direct_Results"]:
                if not result["Url"]:
                    continue
                host = _cinema_host(result["Url"])
                if host in _CINEMA_HOSTS:
                    if host not in all_hosts:
                        all_hosts[host] = _empty_host_stat()
                    # A per-request flag, not a counter: set, never incremented.
                    all_hosts[host]['direct_shows'] = 1
                    # NOTE(review): the key appears to be spelled with a
                    # non-ASCII (Cyrillic) letter, matching what parse_direct
                    # writes; keep it byte-identical.
                    all_hosts[host]['direct_clicks'] += result.get('Cliсks', 0)

        for host, stat in all_hosts.items():
            yield Record(query=rec["Query"], uid=rec["UID"], reqid=rec['ReqId'],
                         normalized_query=rec['NormalizedQuery'],
                         clicks=stat['clicks'],
                         has_click=1 if stat['clicks'] else 0,
                         shows=stat['shows'],
                         has_shows=1 if stat['shows'] else 0,
                         real_positions=stat['real_positions'],
                         positions=stat['positions'],
                         dwelltime_clicks=stat['dwelltime_clicks'],
                         has_dwelltime_clicks=1 if stat['dwelltime_clicks'] else 0,
                         stat=stat,
                         in_top_3=1 if host in hosts_top_3 else 0,
                         host=host,
                         direct_shows=stat["direct_shows"],
                         direct_clicks=stat["direct_clicks"])


def parse_pos(recs):
    """Mapper: explode the per-request position lists into one row each.

    For every index of ``rec['real_positions']`` a Record is yielded that
    carries the record's ``host`` and ``fielddate`` together with the
    ``real_position`` / ``position`` pair found at that index. Records with
    an empty ``real_positions`` list produce nothing.
    """
    for rec in recs:
        for idx, real_position in enumerate(rec['real_positions']):
            yield Record(
                host=rec['host'],
                fielddate=rec['fielddate'],
                real_position=real_position,
                position=rec['positions'][idx]
            )


def prepare_for_stat(date):
    """Build and run the Nile job that writes ``$job_root/final_<date>``.

    Steps, in the order the streams are created:

    1. Concatenate the daily search and direct_urls session tables and
       reduce them with ``parse_direct`` (grouped by ``key``, sorted by
       ``subkey``).
    2. Join that result (``type='right'``) with the daily web squeeze table
       on ``(UID, ReqId, Query)``.
    3. Count first-page rows (``total_reqs``), map the joined rows with
       ``parse_us`` and count the mapped rows (``cinema_total``).
    4. Aggregate mean/median positions and show/click sums per host, plus
       the same aggregates over all hosts under the host name ``_total_``.
    5. Join everything on ``fielddate`` (and ``host``) and store the table.

    Args:
        date: day to process, as the string used in the table paths.
    """
    def add_fielddate(stream):
        # Keep every existing column and stamp the rows with the report date.
        return stream.project(ne.all(), fielddate=ne.const(date))

    def as_total(stream, *columns):
        # Keep only ``columns`` and relabel every row with the '_total_' host.
        return stream.project(*columns, host=ne.const('_total_'))

    def position_summary(stream):
        # One row per position, then mean/median per (fielddate, host).
        exploded = stream.map(parse_pos)
        return exploded.groupby('fielddate', 'host').aggregate(
            mean_real_position=na.mean('real_position'),
            mean_position=na.mean('position'),
            median_real_position=na.median('real_position'),
            median_position=na.median('position')
        )

    def traffic_summary(stream):
        # Sums of the per-request flags and counters per (fielddate, host).
        return stream.groupby('fielddate', 'host').aggregate(
            shows=na.sum('has_shows'),
            direct_shows=na.sum("direct_shows"),
            shows_top_3=na.sum('in_top_3'),
            clicks=na.sum('clicks'),
            direct_clicks=na.sum('direct_clicks'),
            dwelltime_clicks=na.sum('has_dwelltime_clicks')
        )

    job = cluster.job()

    us_web = job.table('//user_sessions/pub/search/daily/' + date + '/clean')
    us_direct = job.table('user_sessions/pub/direct_urls/daily/' + date + '/clean')
    sessions = job.concat(us_direct, us_web)

    grouped_sessions = sessions.groupby('key').sort('subkey')
    # Files that parse_direct needs on the cluster side.
    reducer_files = [
        nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
        nile.files.RemoteFile('statbox/resources/libra.so'),
        nile.files.StatboxWheel('yandex_baobab_api'),
    ]
    direct = grouped_sessions.reduce(
        parse_direct,
        files=reducer_files,
        memory_limit=4000,
        intensity='data'
    ).sort('ts')

    web_squeeze = job.table('//home/dict/ontodb/squeezer/' + date + '/web')
    logs = direct.join(web_squeeze, by=('UID', 'ReqId', 'Query'), type='right')

    first_page = logs.filter(sf.equals("PageNo", 0))
    total_queries = add_fielddate(first_page.aggregate(total_reqs=na.count()))

    parsed = add_fielddate(logs.map(parse_us))
    cinema_queries = add_fielddate(parsed.aggregate(cinema_total=na.count()))

    host_pos = position_summary(parsed)
    total_pos = position_summary(
        as_total(parsed, 'real_positions', 'positions', 'fielddate')
    )
    pos = job.concat(host_pos, total_pos)

    host_stat = traffic_summary(parsed)
    total_stat = traffic_summary(
        as_total(parsed, 'has_shows', 'in_top_3', 'has_click',
                 'has_dwelltime_clicks', 'fielddate', 'clicks',
                 'direct_shows', 'direct_clicks')
    )
    stat = job.concat(host_stat, total_stat)

    totals = total_queries.join(cinema_queries, by='fielddate')
    report = totals.join(pos, by='fielddate')
    report = report.join(stat, by=('fielddate', 'host'))
    report.put('$job_root/final_' + date)

    job.run()


def put_data_to_stat(date):
    """Publish the ``final_<date>`` table to the Statface report.

    The Statface token is read from the ``STAT_TOKEN`` environment variable
    (a ``KeyError`` is raised when it is not set). The table is published to
    the daily-scale report ``Video.All/online_cinemas``.

    Args:
        date: day whose table should be uploaded, as used in the table name.
    """
    stat_client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        token=os.environ['STAT_TOKEN']
    )

    report = ns.StatfaceReport().path('Video.All/online_cinemas')
    report = report.scale('daily')
    report = report.client(stat_client)

    source_table = '//home/videolog/vika-pavlova/5112-online_cinemas/final_' + date
    report.remote_publish(
        proxy='hahn',
        table_path=source_table,
        async_mode=False,
        upload_config=False
    )


def main():
    """Command-line entry point.

    Requires ``--start_date`` and ``--end_date``. Every day of the inclusive
    range is first computed on the cluster and then uploaded to Statface.
    """
    arg_parser = argparse.ArgumentParser()
    for option in ('--start_date', '--end_date'):
        arg_parser.add_argument(option, type=str, required=True)
    options = arg_parser.parse_args()

    days = pd.date_range(start=options.start_date, end=options.end_date)
    for day in days:
        # The first ten characters of the timestamp text are the date part.
        day_str = str(day)[:10]
        prepare_for_stat(day_str)
        put_data_to_stat(day_str)


# Run the pipeline only when the file is executed as a script.
if __name__ == '__main__':
    main()
