# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os


# YT (Hahn) cluster handle shared by every job in this script.
# The `job_root` template is what the '$job_root/...' table paths expand to.
cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
    templates={
        'job_root': '//home/videolog/vika-pavlova/mma-4662-video_hosts_cube',
    },
    yt_spec_defaults={
        'pool_trees': ["physical"],
        'use_default_tentative_pool_trees': True,
    },
    parallel_operations_limit=10,
)


def parse_cube(recs):
    """Flatten cube records into one output record per result.

    For every input record a ``theme`` is derived once:

    * ``'series'`` when ``obj_type`` is ``"Film/Series/"``;
    * ``'film'`` when ``obj_type`` is ``"Film/Film/"``;
    * ``'porno'`` when ``SearchPropsValues`` is a mapping whose
      ``"VIDEO.VideoPorno.vidprn"`` value is ``"ipq1"``;
    * ``'other'`` otherwise.

    ``obj_type`` takes precedence over the porno flag, and series over film,
    which matches the order in which the checks used to overwrite each other.

    Then, for each entry of ``rec['results']``, a ``Record`` is yielded that
    combines the query-level fields with the per-result metrics and the host
    (the ``netloc`` part of ``result['url']``).
    """
    for rec in recs:
        search_props = rec['SearchPropsValues']
        obj_type = rec['obj_type']

        if obj_type == "Film/Series/":
            theme = 'series'
        elif obj_type == "Film/Film/":
            theme = 'film'
        elif (isinstance(search_props, dict)
              and search_props.get("VIDEO.VideoPorno.vidprn") == "ipq1"):
            # isinstance (not an exact type comparison) so that dict
            # subclasses are recognised as well.
            theme = 'porno'
        else:
            theme = 'other'

        # Same for every result of this record, so computed once.
        # `== True` is kept on purpose: any value equal to True counts.
        is_related = 'yes' if rec.is_related == True else 'no'

        for result in rec['results']:
            host = urlparse.urlparse(result['url']).netloc

            # NOTE(review): `url` is taken from the query-level record, not
            # from result['url'] -- confirm this is intended.
            yield Record(uid=rec.uid, ReqID=rec.ReqID, is_related=is_related,
                         fielddate=rec.fielddate, ui=rec.ui,
                         host=host, tvt=result['tvt'],
                         lvt=result['lvt'], long_watches=rec['long_watches'],
                         pos=result['pos'], player=result['player'],
                         clicks=result['clicks'], duration=result['duration'],
                         theme=theme, url=rec.url)


def recs_combination(recs):
    """Fan each record out over every slice / '_total_' combination.

    Each of the four dimensions (``ui``, ``is_related``, ``theme`` and
    ``new_host``) is paired with the literal ``'_total_'``; the Cartesian
    product of those pairs gives 16 combinations per input record.  One
    ``Record`` is yielded per combination, carrying the unchanged metrics.
    The ``new_host`` value is emitted under the name ``host`` and ``ReqID``
    under the name ``reqid``.
    """
    total = '_total_'

    for rec in recs:
        combinations = product(
            (rec.ui, total),
            (rec.is_related, total),
            (rec.theme, total),
            (rec.new_host, total),
        )

        for ui, is_related, theme, host in combinations:
            yield Record(
                fielddate=rec.fielddate,
                ui=ui,
                is_related=is_related,
                theme=theme,
                host=host,
                tvt=rec.tvt,
                lvt=rec.lvt,
                long_watches=rec.long_watches,
                pos=rec.pos,
                player=rec.player,
                clicks=rec.clicks,
                duration=rec.duration,
                reqid=rec.ReqID,
            )


def find_top(date):
    """Build the per-result table for ``date`` and the top-100 host list.

    Reads the search and related query cubes for ``date``, concatenates
    them, flattens them with ``parse_cube`` and stores the result in
    ``$job_root/raw``.  The flattened rows are then grouped by ``host``,
    counted (the count is stored in a column named ``clicks``), and the 100
    hosts with the largest count are written to ``$job_root/top``.
    """
    job = cluster.job()

    search_cube = job.table('//home/videolog/24julia/video_queries_cube/' + date)
    related_cube = job.table('//home/videolog/24julia/video_queries_cube.related/' + date)

    flattened = job.concat(search_cube, related_cube)
    flattened = flattened.map(parse_cube)
    flattened = flattened.put('$job_root/raw')

    per_host = flattened.groupby('host').aggregate(clicks=na.count())
    per_host.top(100, by='clicks').put('$job_root/top')

    job.run()


def prepare_for_stat(top, date):
    """Aggregate ``$job_root/raw`` into the report table for ``date``.

    ``top`` is the collection of hosts that keep their own name; every other
    host is replaced with ``'other'`` (stored as ``new_host``).  The rows are
    expanded with ``recs_combination`` and aggregated per
    ``(fielddate, ui, is_related, theme, host)`` into
    ``$job_root/final_<date>``.
    """
    job = cluster.job()

    with_new_host = job.table('$job_root/raw').sort('host').project(
        ne.all(),
        new_host=ne.custom(lambda x: x if x in top else 'other', 'host'),
    )

    expanded = with_new_host.map(recs_combination, memory_limit=10000)
    grouped = expanded.groupby('fielddate', 'ui', 'is_related', 'theme', 'host')

    # Position predicates select requests where the host was shown at
    # pos == 0 and at pos < 5 respectively.
    at_first_position = nf.custom(lambda x: x == 0, 'pos')
    within_first_five = nf.custom(lambda x: x < 5, 'pos')

    report = grouped.aggregate(
        tvt=na.sum('tvt'),
        clicks=na.sum('clicks'),
        lvt=na.sum('lvt'),
        long_watches=na.sum('long_watches'),
        shows=na.count_distinct('reqid'),
        shows_top_1=na.count_distinct('reqid', predicate=at_first_position),
        shows_top_5=na.count_distinct('reqid', predicate=within_first_five),
    )
    report.put('$job_root/final_' + date)

    job.run()


def put_data_to_stat(date):
    """Publish the ``final_<date>`` table to the Statface report.

    The Statface token is read from the ``STAT_TOKEN`` environment variable
    (``KeyError`` if it is not set).  The upload targets the daily scale of
    ``Video.All/video_hosts_report`` and is performed with
    ``async_mode=False``.
    """
    stat_client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        token=os.environ['STAT_TOKEN'],
    )

    report = ns.StatfaceReport()
    report = report.path('Video.All/video_hosts_report')
    report = report.scale('daily')
    report = report.client(stat_client)

    report.remote_publish(
        proxy='hahn',
        table_path='//home/videolog/vika-pavlova/mma-4662-video_hosts_cube/final_' + date,
        async_mode=False,
        upload_config=False,
    )



def main():
    """Run the whole pipeline for every day in [--start_date, --end_date].

    For each day: build the raw table and the top-100 host list
    (``find_top``), merge that list with a fixed set of always-kept hosts,
    aggregate (``prepare_for_stat``), publish (``put_data_to_stat``) and
    finally remove the intermediate ``raw`` and ``top`` tables.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--start_date', type=str, required=True)
    parser.add_argument('--end_date', type=str, required=True)
    args = parser.parse_args()

    # Same location as the `job_root` template of the cluster environment.
    job_root = '//home/videolog/vika-pavlova/mma-4662-video_hosts_cube'
    raw_path = job_root + '/raw'
    top_path = job_root + '/top'

    # Hosts that always keep their own name in the report, regardless of
    # whether they make it into the daily top-100.  Built once, copied per day.
    always_kept_hosts = frozenset([
        "www.youtube.com", "vk.com", "ok.ru", "video.mail.ru",
        "frontend.vh.yandex.ru", "www.24video.site", "www.xvideos.com",
        "www.pornhub.com", "www.instagram.com", "www.tiktok.com",
        "www.facebook.com", "twitter.com",
        # NOTE(review): the original list only had 'www.kinoposk.ru', which
        # looks like a typo for 'www.kinopoisk.ru'.  The correct spelling is
        # added; the original one is kept until this is confirmed.
        'www.kinopoisk.ru', 'www.kinoposk.ru',
        'sportrecs.com', 'russian.rt.com', 'ren.tv',
    ])

    for date in pd.date_range(start=args.start_date, end=args.end_date):
        date_str = date.strftime('%Y-%m-%d')

        find_top(date_str)

        # Fresh copy per day so one day's top hosts do not leak into the next.
        top = set(always_kept_hosts)
        for rec in cluster.read(top_path):
            top.add(str(rec['host']).decode('utf-8'))

        prepare_for_stat(top, date_str)

        put_data_to_stat(date_str)

        # Intermediate tables are per-day; drop them before the next date.
        cluster.driver.remove(raw_path)
        cluster.driver.remove(top_path)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
