from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    cli,
    with_hints,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from qb2.api.v1 import typing as qt

import nile
import json
import math
import urlparse

@with_hints(output_schema=dict(
    yandexuid=str, guid=qt.Optional[qt.String], url=str, viewtime=float, duration=float, src_url=str, format=str)
           )
def parse_bs(recs):
    """Flatten the ``vc`` payload of every record into one row per video view.

    ``rec.vc`` is decoded with ``json.loads`` and walked element by element.
    Two element layouts are recognised:

    * ``format='new'`` -- an object with ``url`` and ``data`` keys; every item
      of ``data`` provides ``uid``, ``total_played_duration``, ``duration``
      and ``media_url``.
    * ``format='old'`` -- any other object; its first key is taken as the page
      URL and the value under that key is expected to hold a list ``p`` of
      positional arrays.

    A duration equal to ``'live'`` is replaced by ``-1`` in both layouts.

    Elements that are not non-empty JSON objects, and old-format entries
    without a usable ``p`` list, are skipped instead of failing the mapper.

    :param recs: iterable of records exposing ``vc`` (JSON text) and
        ``yandexuid`` attributes.
    :return: generator of ``Record`` rows matching ``output_schema``.
    """
    for rec in recs:
        for elem in json.loads(rec.vc):
            # Only non-empty JSON objects carry data in either layout.
            if not isinstance(elem, dict) or not elem:
                continue

            if 'data' in elem and 'url' in elem:
                url = elem['url']
                for video_info in elem['data']:
                    duration = video_info['duration']
                    if duration == 'live':
                        duration = -1
                    yield Record(
                        yandexuid=rec.yandexuid,
                        guid=video_info['uid'],
                        url=url,
                        viewtime=video_info['total_played_duration'],
                        duration=duration,
                        src_url=video_info['media_url'],
                        format='new',
                    )
                continue

            # Old layout: the first key of the object is the page URL.
            # ``elem`` is known to be non-empty here, so ``next`` cannot raise
            # StopIteration (which would silently end this generator on py2).
            url = next(iter(elem))
            entry = elem[url]
            views = entry.get('p') if isinstance(entry, dict) else None
            if not views:
                # No 'p' list (or a value that is not an object): nothing to emit.
                continue

            for video_info in views:
                guid = video_info[0]
                shift = 0
                if len(str(guid)) <= 1:
                    # A first item this short is treated as "no guid": the
                    # remaining fields are then read one position earlier.
                    guid = None
                    shift = -1
                duration = video_info[4 + shift]
                if duration == 'live':
                    duration = -1
                yield Record(
                    yandexuid=rec.yandexuid,
                    guid=guid,
                    url=url,
                    viewtime=video_info[5 + shift],
                    duration=duration,
                    src_url=video_info[9 + shift],
                    format='old',
                )

def my_host(url):
    """Return the grouping key for ``url``.

    The key is the lower-cased network location of the URL. For the exact
    host ``yandex.ru`` the first path segment is appended
    (``yandex.ru/<segment>``) whenever the path contains at least one ``/``.
    """
    parts = urlparse.urlparse(url)
    netloc = parts.netloc.lower()
    if netloc != 'yandex.ru':
        return netloc

    segments = parts.path.split('/')
    if len(segments) < 2:
        # Empty path: there is no segment to append.
        return netloc
    return 'yandex.ru/' + segments[1]

@cli.statinfra_job
def make_job(job, options, statface_client):
    """Build the daily per-host video view-time job and publish it.

    Stages, in order:

    1. Read ``statbox/bar-navig-log/$date`` through qb2, keep rows with
       ``vc`` and ``yandexuid`` defined, ``yasoft == 'yabrowser'`` and
       ``geo_id`` belonging to region 225, then expand them with
       ``parse_bs``.
    2. Collapse rows that have a ``guid`` to one row per
       ``(yandexuid, guid)`` (max view time / duration), and keep rows
       without a ``guid`` unchanged.
    3. Compute ``tvt`` and ``lvt`` per row, aggregate them per host, keep
       hosts with more than 5000 distinct ``yandexuid`` values and publish
       the result to the ``Video.All/Others/compareTVT2`` report.

    Intermediate tables are stored under ``$job_root``.

    :param job: job object to attach the pipeline to.
    :param options: run options; ``options.dates[0]`` becomes ``fielddate``.
    :param statface_client: client passed to the Statface report.
    :return: the configured job.
    """
    yt_spec = dict(
        pool_trees=["physical"],
        tentative_pool_trees=["cloud"],
        job_io={'table_writer': {'max_row_weight': 128 * 1024 * 1024}},
    )
    job = job.env(
        yt_spec_defaults=yt_spec,
        templates=dict(job_root='//home/videolog/liza-p/mma-2667/regular_calc'),
    )

    report = (
        ns.StatfaceReport()
        .path('Video.All/Others/compareTVT2')
        .scale('daily')
        .client(statface_client)
    )

    # --- Stage 1: raw log -> one row per video view -------------------------
    raw_log = job.table('statbox/bar-navig-log/$date')
    browser_log = raw_log.qb2(
        log='bar-navig-log',
        fields=[
            'yandexuid',
            se.dictitem('decoded_vc', from_='parsed_http_params'),
            se.custom('vc', lambda x: x[0] if x else None, 'decoded_vc').with_type(qt.Json),
        ],
        filters=[
            sf.defined('vc', 'yandexuid'),
            sf.equals('yasoft', 'yabrowser'),
            sf.region_belongs([225], field='geo_id'),
        ],
    )
    views = browser_log.map(parse_bs, memory_limit=4000)
    prepared_recs = views.put('$job_root/$date spy-log (prepared records)')

    # --- Stage 2: deduplicate views that carry a guid -----------------------
    with_guid = prepared_recs.filter(sf.defined('guid'))
    has_guid = with_guid.groupby('yandexuid', 'guid').aggregate(
        url=na.any('url'),
        src_url=na.any('src_url'),
        viewtime=na.max('viewtime'),
        duration=na.max('duration'),
    )
    no_guid = prepared_recs.filter(nf.not_(sf.defined('guid')))

    merged = job.concat(has_guid, no_guid)
    prepared_recs_2 = merged.put('$job_root/$date spy-log (prepared records 2)')

    # --- Stage 3: per-host statistics ---------------------------------------
    # NOTE(review): the lambda argument names below appear to select the
    # input fields -- confirm against the nile docs before renaming them.
    per_view = prepared_recs_2.project(
        yandexuid='yandexuid',
        host=ne.custom(lambda url: my_host(url)).with_type(str),
        # View time is capped by the duration; duration -1 counts as 0.
        tvt=ne.custom(lambda viewtime, duration: min(viewtime, duration) if duration != -1 else 0).with_type(float),
    )
    per_view_with_lvt = per_view.project(
        ne.all(),
        # Views shorter than 30 contribute nothing to the log-scaled metric.
        lvt=ne.custom(lambda tvt: math.log(tvt-25.0) if tvt >= 30 else 0).with_type(float),
    )
    per_host = per_view_with_lvt.groupby('host').aggregate(
        tvt=na.sum('tvt'),
        lvt=na.sum('lvt'),
        uids=na.count_distinct('yandexuid'),
    )
    dated = per_host.project(
        ne.all(),
        fielddate=ne.const(options.dates[0]).with_type(str),
    )
    popular = dated.filter(sf.custom(lambda uids: uids > 5000))
    ordered = popular.sort('uids')
    stored = ordered.put('$job_root/$date spy-log (stats hosts)')
    stored.publish(report)

    return job

# Script entry point: hand control to the nile CLI runner.
# NOTE(review): presumably this dispatches to the job registered with
# @cli.statinfra_job above -- confirm against the nile CLI docs.
if __name__ == '__main__':
    cli.run()
