# -*-coding: utf8 -*-

from qb2.api.v1.typing import *
from nile.api.v1 import clusters, Record, files, cli, with_hints, aggregators as na, extractors as ne


# Column type hints attached to reduce_libra via @with_hints: one entry per
# field of the Record objects the reducer yields.
output_schema_0 = dict(
    pos=UInt8,
    url=String,
    reqid=String,
)


# Passed as weak_schema when the '@authors' table is opened in make_job;
# declares only the 'key' column.
weak_schema_0 = dict(
    key=String,
)


@with_hints(output_schema_0)
def reduce_libra(groups):
    """Emit the leading video results of first-page related-video requests.

    Each group of records is parsed with ``libra.ParseSession``; groups that
    fail to parse are skipped. For every request that is a
    ``TRelatedVideoRequestProperties`` with ``PageNo == 0``, the main blocks
    are scanned and up to ten ``TVideoResult`` results with a usable URL are
    yielded.

    Args:
        groups: iterable of ``(key, records)`` pairs.

    Yields:
        Record with fields ``reqid`` (the request's ``ReqID``), ``url`` (the
        part of the result URL after the first ``'//'``) and ``pos`` (the
        zero-based index among the results emitted for that request, not the
        index of the block on the page).
    """
    # Imported inside the function body rather than at module level, as in
    # the original code.
    import libra

    max_results = 10

    for _group_key, session_rows in groups:
        try:
            session = libra.ParseSession(session_rows, 'blockstat.dict')
        except Exception:
            # Best effort: a group that cannot be parsed contributes nothing.
            continue

        for request in session:
            if not (request.IsA('TRelatedVideoRequestProperties') and request.PageNo == 0):
                continue

            emitted = 0
            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                raw_url = getattr(result, 'Url', None)
                if not (result.IsA('TVideoResult') and raw_url):
                    continue

                # Keep only what follows the first '//'; URLs without that
                # separator are dropped.
                _scheme, separator, schemeless_url = raw_url.partition('//')
                if not separator:
                    continue

                yield Record(pos=emitted, url=schemeless_url, reqid=request.ReqID)

                emitted += 1
                if emitted == max_results:
                    break


@cli.statinfra_job
def make_job(job, options):
    """Build the job graph that appends one daily statistics row.

    The video session log for ``options.dates[0]`` is reduced with
    ``reduce_libra``, left-joined with the authors table on
    ``url == GroupingUrl``, and aggregated into a single row with:

    * ``fielddate`` - the processed date;
    * ``total_distinct`` - number of distinct ``key`` values;
    * ``reqid_distinct`` - sum over requests of the per-request distinct
      ``key`` count, divided by the number of distinct ``reqid`` values.

    That row is concatenated with the existing contents of
    ``@job_root/data``, sorted by ``fielddate`` and written back to
    ``@job_root/data``.

    Args:
        job: job object to configure and attach the streams to.
        options: CLI options; only ``options.dates[0]`` is read.

    Returns:
        The configured job.
    """
    path_templates = {
        'authors': '//home/videoindex/full/docbase/prevdata/full_index/authors',
        'job_root': '//home/imgdev/ndchikin/MMA-5618',
        'logs_root': '//home/user_sessions/pub/video/daily',
    }
    spec_defaults = {
        'pool_trees': ['physical'],
        'use_default_tentative_pool_trees': True,
    }
    job = job.env(templates=path_templates, yt_spec_defaults=spec_defaults)

    date = options.dates[0]

    history = job.table('@job_root/data')
    sessions = job.table('@logs_root/{}/clean'.format(date))
    authors = job.table('@authors', weak_schema=weak_schema_0)

    def _with_fielddate(stream):
        # Keep every existing column and add the processed date as a constant.
        return stream.project(ne.all(), fielddate=ne.const(date))

    def _ratio(numerator, denominator):
        return float(numerator) / denominator

    # Files shipped to the reducer alongside reduce_libra.
    reducer_files = [
        files.RemoteFile('//statbox/resources/libra.so'),
        files.RemoteFile('//statbox/statbox-dict-last/blockstat.dict'),
    ]
    video_results = sessions.groupby('key').sort('subkey').reduce(
        reduce_libra,
        memory_limit=5 * 1024,
        files=reducer_files,
    )

    # 'key' is not among the reducer's output columns; presumably it comes
    # from the authors table (weak_schema_0 declares it) - TODO confirm.
    joined = video_results.join(
        authors,
        type='left',
        by_left='url',
        by_right='GroupingUrl',
        force_unique_right=True,
    )
    # NOTE(review): the checkpoint path is the same '@job_root/data' table
    # that is read as history above and written by the final put - confirm
    # this is intended.
    data = joined.checkpoint('data', '@job_root/data')

    totals = data.aggregate(
        total=na.count_distinct('reqid'),
        total_distinct=na.count_distinct('key'),
    )
    total = _with_fielddate(totals)

    per_request = data.groupby('reqid').aggregate(
        distinct=na.count_distinct('key', in_memory=True),
    )
    summed = per_request.aggregate(reqid_distinct=na.sum('distinct'))
    reqid = _with_fielddate(summed)

    daily_row = total.join(reqid, 'fielddate').project(
        'fielddate',
        'total_distinct',
        reqid_distinct=ne.custom(_ratio, 'reqid_distinct', 'total').with_type(Float),
    )
    daily_row.concat(history).sort('fielddate').put('@job_root/data')

    return job


# Script entry point: hand control to the nile CLI runner. Presumably it
# dispatches to the job registered with @cli.statinfra_job - confirm.
if __name__ == '__main__':
    cli.run()
