# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    cli,
    with_hints,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from qb2.api.v1 import typing as qt

import nile
import json
import math
import urlparse

def _bs_video_records(yandexuid, guid, url, viewtime, duration, src_url, fmt):
    """Yield every output record produced by one video entry.

    One record is always emitted for the page `url` itself and one for the
    '_total_' bucket.  When the page host is exactly "yandex.ru" an extra
    record goes to the yandex.ru total bucket, and, if `src_url` points at
    yastatic.net blobs or strm.yandex.ru, one more goes to the yandex.ru
    video-hosting bucket.  All records share the same values except `url`.
    """
    if duration == 'live':
        # Live streams report the string 'live'; -1 keeps the column numeric.
        duration = -1

    buckets = [url, '_total_']
    if urlparse.urlparse(url).netloc.lower() == "yandex.ru":
        buckets.append('yandex.ru (всего)')
        if "blob:https://yastatic.net/" in src_url or "strm.yandex.ru/" in src_url:
            buckets.append('Видеохостинг на yandex.ru')

    for bucket in buckets:
        yield Record(yandexuid=yandexuid, guid=guid, url=bucket, viewtime=viewtime,
                     duration=duration, src_url=src_url, format=fmt)


@with_hints(output_schema=dict(yandexuid=str, guid=qt.Optional[qt.String], url=str, viewtime=float, duration=float, src_url=str, format=str))
def parse_bs(recs):
    """Mapper: unpack the JSON in `rec.vc` into per-video view records.

    Each input record must expose `vc` (a JSON string) and `yandexuid`.
    The decoded payload is expected to be a list of dicts in one of two
    layouts:

    * "new": the dict has both 'data' and 'url' keys; 'data' is a list of
      dicts read through the keys 'uid', 'total_played_duration',
      'duration' and 'media_url'.
    * "old": the dict's (first) key is the page URL and its value holds a
      list under 'p'; every item is a positional array.  Item 0 is the
      guid, unless its string form is at most one character long -- then
      the guid is treated as absent and all positions shift left by one.

    Parsing is best effort: payloads or entries that do not match the
    expected shape are skipped instead of failing the whole job.

    Yields:
        Record(yandexuid, guid, url, viewtime, duration, src_url, format)
        with format set to 'new' or 'old'.
    """
    for rec in recs:
        try:
            parsed_vc = json.loads(rec.vc)
        except Exception:
            # Not valid JSON (or not a string at all): nothing to extract.
            continue
        if not isinstance(parsed_vc, list):
            continue

        for elem in parsed_vc:
            if not isinstance(elem, dict) or not elem:
                continue

            if 'data' in elem and 'url' in elem:
                url = elem['url']
                data = elem['data']
                if not isinstance(data, list):
                    continue
                for video_info in data:
                    if not isinstance(video_info, dict):
                        continue
                    # `guid` is optional in the output schema, so a missing
                    # 'uid' becomes None rather than a KeyError.
                    guid = video_info.get('uid')
                    viewtime = video_info.get('total_played_duration', 0)
                    duration = video_info.get('duration', 0)
                    src_url = video_info.get('media_url', '')
                    for out in _bs_video_records(rec.yandexuid, guid, url, viewtime,
                                                 duration, src_url, 'new'):
                        yield out
            else:
                url = next(iter(elem))
                payload = elem[url]
                p = payload.get('p') if isinstance(payload, dict) else None
                if not isinstance(p, list):
                    # Covers a missing 'p' key, which .get() reports as None.
                    continue
                for video_info in p:
                    try:
                        guid = video_info[0]
                        cor = 0
                        if len(str(guid)) <= 1:
                            # No guid in this entry: every field sits one
                            # position earlier.
                            guid = None
                            cor = -1
                        viewtime = video_info[5 + cor]
                        duration = video_info[4 + cor]
                        src_url = video_info[9 + cor]
                    except (IndexError, KeyError, TypeError):
                        # Entry is too short or not an array at all.
                        continue
                    for out in _bs_video_records(rec.yandexuid, guid, url, viewtime,
                                                 duration, src_url, 'old'):
                        yield out


def _relev_as_float(value):
    """Convert a relevance factor value to float, treating junk as 0.0."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def _is_film_serial_request(req):
    """Tell whether a web request looks like a film/series query.

    True when the 'vserial' relevance value is positive, or when a blockstat
    block whose path contains both 'entity_search' and 'object-badge'
    carries a '-type' variable whose value contains 'Film'.
    """
    # NOTE(review): SearchPropsValues is compared against the string '100'
    # in parse_categories, so RelevValues presumably holds strings too.  In
    # Python 2 any str compares greater than any int, so a raw `value > 0`
    # would be True even for '0'; convert to a number first.
    if _relev_as_float(req.RelevValues.get('vserial', 0)) > 0:
        return True

    for block in req.GetBSBlocks():
        if 'entity_search' in block.Path and 'object-badge' in block.Path:
            for v in block.GetVars():
                if v[0] == '-type' and 'Film' in v[1]:
                    return True
    return False


@with_hints(output_schema=dict(
    uid=str, host=str, query=str, url=str, reqid=str, is_porno_query=int, is_film_serial_query=int)
           )
def parse_categories(groups):
    """Reducer: emit one record per click made from an adult or film/series query.

    Every group is parsed with libra.ParseSession using './blockstat.dict';
    groups that fail to parse are skipped.  Only requests that are
    'TYandexWebRequest' with ServiceDomRegion == 'ru' are considered.  A
    request is flagged as an adult query when its 'WEB.Porno.pl' search
    property equals '100', and as a film/series query according to
    _is_film_serial_request.  Requests with neither flag produce nothing.

    Yields:
        Record(uid, host, query, url, reqid, is_porno_query,
        is_film_serial_query) for each click of a flagged request, where
        `host` is the network location of the clicked URL.
    """
    # Kept as a function-level import, as in the original code; presumably
    # libra is only importable where the reducer actually runs.
    import libra

    for key, recs in groups:
        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            continue

        for req in session:
            if not req.IsA('TYandexWebRequest'):
                continue
            if req.ServiceDomRegion != 'ru':
                continue

            is_porno_query = 1 if req.SearchPropsValues.get('WEB.Porno.pl', 0) == '100' else 0
            is_film_serial_query = 1 if _is_film_serial_request(req) else 0
            if not (is_porno_query or is_film_serial_query):
                continue

            for click in req.GetClicks():
                # Build the record inside the try, but yield outside it: a
                # handler wrapped around `yield` would also intercept the
                # GeneratorExit raised when the generator is closed.
                try:
                    host = urlparse.urlparse(click.Url).netloc
                    record = Record(uid=key.key, host=host, query=req.Query, url=click.Url,
                                    reqid=req.ReqID, is_porno_query=is_porno_query,
                                    is_film_serial_query=is_film_serial_query)
                except Exception:
                    # Best effort: a click that cannot be turned into a
                    # record is dropped.
                    continue
                yield record

def my_host(url):
    """Return the host bucket that a record's `url` value belongs to.

    The three synthetic bucket names are passed through unchanged.  For a
    real URL the result is the lower-cased network location; for the host
    'yandex.ru' the first path segment is appended after a slash whenever
    the path contains at least one '/'.
    """
    if url in ('_total_', 'yandex.ru (всего)', 'Видеохостинг на yandex.ru'):
        return url

    parts = urlparse.urlparse(url)
    netloc = parts.netloc.lower()
    if netloc != 'yandex.ru':
        return netloc

    segments = parts.path.split('/')
    if len(segments) < 2:
        # Empty path: there is no section to append.
        return netloc
    return 'yandex.ru/' + segments[1]

@cli.statinfra_job
def make_job(job, options, statface_client):
    """Assemble the daily per-host video view-time job and publish its report.

    Stages, in order:
      1. read bar-navig-log, keep records that pass the qb2 filters below,
         and expand them with parse_bs;
      2. collapse records that carry a guid to one row per
         (yandexuid, guid, url), leaving guid-less records untouched;
      3. aggregate tvt / lvt / distinct users per host;
      4. derive host categories from search-session clicks with
         parse_categories (top 50 hosts for each category);
      5. left-join the categories onto the host statistics, append a
         '_total_' category row per host and publish the result.

    Returns the configured job.
    """
    job = job.env(
        yt_spec_defaults=dict(pool_trees=["physical"], use_default_tentative_pool_trees=True),
        templates=dict(job_root='//home/videolog/liza-p/mma-2667/regular_calc'),
    )

    report = (
        ns.StatfaceReport()
        .path('Video/Others/compareTVT/TVTbyHosts')
        .scale('daily')
        .client(statface_client)
    )

    # --- 1. raw video views from the browser navigation log ----------------
    navig_log = job.table(
        'statbox/bar-navig-log/$date',
        weak_schema=dict(ip_numeric=qt.Optional[qt.String]),
    )
    navig_fields = [
        'yandexuid',
        se.dictitem('decoded_vc', from_='parsed_http_params'),
        # First element of decoded_vc, or None when it is empty/missing.
        se.custom('vc', lambda x: x[0] if x else None, 'decoded_vc').with_type(qt.Json),
    ]
    navig_filters = [
        sf.defined('vc', 'yandexuid'),
        sf.equals('yasoft', 'yabrowser'),
        sf.region_belongs([225], field='geo_id'),
    ]
    video_views = (
        navig_log
        .qb2(log='bar-navig-log', fields=navig_fields, filters=navig_filters)
        .map(parse_bs, memory_limit=4000)
        .put('$job_root/$date spy-log (prepared records categories)')
    )

    # --- 2. one row per (user, guid, url) where a guid exists --------------
    with_guid = (
        video_views
        .filter(sf.defined('guid'))
        .groupby('yandexuid', 'guid', 'url')
        .aggregate(
            src_url=na.any('src_url'),
            viewtime=na.max('viewtime'),
            duration=na.max('duration'),
        )
    )
    without_guid = video_views.filter(nf.not_(sf.defined('guid')))
    merged_views = (
        job.concat(with_guid, without_guid)
        .put('$job_root/$date spy-log (prepared records 2 categories)')
    )

    # --- 3. per-host statistics ---------------------------------------------
    per_view = merged_views.project(
        yandexuid='yandexuid',
        host=ne.custom(lambda url: my_host(url)).with_type(str),
        # View time capped by the duration; rows with duration -1 count as 0.
        tvt=ne.custom(lambda viewtime, duration: min(viewtime, duration) if duration != -1 else 0).with_type(float),
    )
    per_view = per_view.project(
        ne.all(),
        lvt=ne.custom(lambda tvt: math.log(tvt-25.0) if tvt >= 30 else 0).with_type(float),
    )
    per_host = per_view.groupby('host').aggregate(
        tvt=na.sum('tvt'),
        lvt=na.sum('lvt'),
        uids=na.count_distinct('yandexuid'),
    )
    per_host = per_host.project(
        ne.all(),
        fielddate=ne.const(options.dates[0]).with_type(str),
        platform=ne.const('desktop').with_type(str),
    )
    # Keep hosts with more than 5000 users plus the synthetic buckets.
    keep_host = sf.or_(
        sf.custom(lambda uids: uids > 5000),
        sf.custom(lambda host: host in ['_total_', 'yandex.ru (всего)', 'Видеохостинг на yandex.ru']),
    )
    hosts_stats = (
        per_host
        .filter(keep_host)
        .sort('uids')
        .put('$job_root/$date spy-log (stats hosts categories)')
    )

    # --- 4. host categories from search sessions ----------------------------
    # NOTE(review): take(1) is applied to the sessions table before grouping;
    # this looks like a leftover debugging limit -- confirm it is intended.
    sessions = job.table('//user_sessions/pub/search/daily/$date/clean').take(1)
    flagged_clicks = sessions.groupby('key').sort('subkey').reduce(
        parse_categories,
        files=[
            nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
            nile.files.RemoteFile('statbox/resources/libra.so'),
        ],
        memory_limit=4000,
    )
    hosts_category = flagged_clicks.groupby('host').aggregate(
        porno_clicks=na.sum('is_porno_query'),
        film_serial_clicks=na.sum('is_film_serial_query'),
    )

    def top_hosts(clicks_field, category):
        # Top 50 hosts by the given click counter, minus the listed hosts.
        return (
            hosts_category
            .filter(nf.custom(lambda host: host not in ['yandex.ru', 'www.yandex.ru', 'www.youtube.com', 'vk.com', 'ok.ru', 'yandex.ru (всего)', 'Видеохостинг на yandex.ru']))
            .top(50, by=clicks_field)
            .project(host='host', category=ne.const(category).with_type(str))
        )

    hosts_porno = top_hosts('porno_clicks', 'porn')
    hosts_film_serial = top_hosts('film_serial_clicks', 'film/series')
    hosts = job.concat(hosts_porno, hosts_film_serial).put('$job_root/$date hosts categories')

    # --- 5. attach categories, add per-host totals, publish -----------------
    categorized = hosts_stats.join(hosts, type='left', by='host').project(
        ne.all(exclude='category'),
        # Hosts without a match in the category table fall into 'other'.
        category=ne.custom(lambda category: category if category else 'other').with_type(str),
    )
    # NOTE(review): only the '_total_' rows divide tvt by 3600.0; the
    # per-category rows above are published with tvt unchanged -- confirm
    # that the difference is intended.
    totals = categorized.groupby('host').aggregate(
        uids=na.any('uids'),
        tvt=na.any('tvt'),
        lvt=na.any('lvt'),
        platform=na.any('platform'),
        fielddate=na.any('fielddate'),
    ).project(
        ne.all(exclude='tvt'),
        tvt=ne.custom(lambda tvt: tvt/3600.0).with_type(float),
        category=ne.const('_total_').with_type(str),
    )
    job.concat(categorized, totals).put('$job_root/$date result').publish(report)

    return job

# When executed as a script, hand control to the `cli` runner imported from
# nile.api.v1.
if __name__ == '__main__':
    cli.run()
