#!/usr/bin/env python2.7
# coding: utf8

import argparse
import nile
import codecs

from nile.api.v1 import (
    clusters, Record,
    aggregators as na,
    extractors as ne)


def parse_args():
    """Parse the command-line options of the script.

    Returns:
        argparse.Namespace with ``output`` (a file object that argparse has
        already opened for writing), ``token``, ``root``, ``history_len``
        (int), ``us_data``, ``cluster_name`` and ``names``.
    """
    # (flags, keyword options) pairs, registered in this exact order.
    option_specs = (
        (('-o', '--output'),
         dict(type=argparse.FileType('w'), required=True)),
        (('-t', '--token'), dict(required=True)),
        (('-r', '--root'),
         dict(default="//home/search-research/24julia/mma-775")),
        (('-hl', '--history_len'), dict(default=7, type=int)),
        (('-u', '--us_data'),
         dict(default='//user_sessions/pub/search/daily')),
        (('-c', '--cluster_name'), dict(default='hahn')),
        (('-n', '--names'), dict(required=True)),
    )
    cli = argparse.ArgumentParser()
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()


def exists_and_not_empty(path, yt):
    """Tell whether ``path`` exists and holds some data.

    Args:
        path: path handed to the client calls.
        yt: client object providing ``exists(path)`` and ``is_empty(path)``.

    Returns:
        The falsy result of ``yt.exists(path)`` when the path is missing
        (``is_empty`` is not queried in that case), otherwise
        ``not yt.is_empty(path)``.
    """
    present = yt.exists(path)
    if not present:
        return present
    return not yt.is_empty(path)


def get_tabs(args, yt):
    """Pick the most recent user-session ``clean`` tables that have data.

    Looks at the last ``args.history_len + 14`` entries of ``args.us_data``
    (by sorted name), keeps those whose ``<entry>/clean`` path exists and is
    not empty, and returns at most ``args.history_len`` of the last ones.

    Args:
        args: parsed options; ``us_data`` and ``history_len`` are read.
        yt: client providing ``list``, ``exists`` and ``is_empty``.

    Returns:
        List of ``<us_data>/<entry>/clean`` paths in ascending entry-name
        order; an empty list when ``history_len`` is not positive.
    """
    history_len = args.history_len
    if history_len <= 0:
        # ``seq[-0:]`` is the whole sequence, so without this guard a zero
        # history length would return every candidate instead of none.
        return []
    # Extra entries inspected beyond the requested history, presumably so
    # that missing or empty days can be skipped -- TODO confirm the intent.
    spare_entries = 14
    recent = sorted(yt.list(args.us_data))[-(history_len + spare_entries):]
    clean_paths = [args.us_data + '/' + entry + '/clean' for entry in recent]
    # Filtering preserves the sorted order, so no second sort is required.
    usable = [p for p in clean_paths if exists_and_not_empty(p, yt)]
    return usable[-history_len:]


class ParseUSData(object):
    """Reducer that turns session records into series-episode query rows.

    Called with groups of records, it parses each group with ``libra`` and
    yields one ``Record(domain, query, episode)`` for every request that
    passes the filters in ``__call__`` and whose ``vsertitle`` value
    contains one of the configured names.
    """

    def __init__(self, names):
        """Store the names to look for.

        Args:
            names: iterable of strings; each is tested as a substring of
                the UTF-8 decoded ``vsertitle`` value of a request.
        """
        self.names = names

    def __call__(self, groups):
        """Yield a ``Record`` for every matching request in ``groups``.

        Args:
            groups: iterable of ``(key, records)`` pairs; ``records`` is
                passed to ``libra.ParseSession``.

        Yields:
            ``Record`` with ``domain`` (``ServiceDomRegion``), ``query``
            and ``episode`` -- the title, season and episode values joined
            with ``';'`` together with the literal words used below.
        """
        # Imported lazily: presumably the module is only available where
        # the reducer actually runs -- TODO confirm.
        import libra
        for key, recs in groups:
            try:
                session = libra.ParseSession(recs, './blockstat.dict')
            except Exception:
                # Best effort: a group that cannot be parsed is skipped.
                # ``Exception`` (not a bare ``except``) keeps
                # KeyboardInterrupt / SystemExit propagating.
                continue
            for r in session:
                if not r.IsA('TVideoRequestProperties'):
                    continue
                if r.IsA('TYandexRelatedVideoRequest'):
                    continue
                if r.SerpID != r.ReqID:
                    continue
                if not r.IsA('TMiscRequestProperties'):
                    continue
                values = r.RelevValues
                if 'vserial' not in values:
                    continue
                title = values.get('vsertitle')
                if title is None:
                    # 'vserial' does not guarantee that 'vsertitle' is set;
                    # skip the request instead of failing with KeyError.
                    continue
                # Decode once per request rather than once per name.
                title_text = title.decode('utf8', errors='replace')
                if not any(name in title_text for name in self.names):
                    continue
                if 'vseason' not in values:
                    continue
                episode = values.get('vepisode')
                if not episode:
                    continue
                yield Record(
                    domain=r.ServiceDomRegion,
                    query=r.Query,
                    episode=';'.join([
                        title, values['vseason'], 'сезон',
                        episode, 'серия',
                    ]))


def main():
    """Build per-day and aggregated (query, episode) count tables.

    Steps:
      1. Read options and the list of names.
      2. For every source table returned by ``get_tabs`` reuse the already
         computed ``<root>/us/<day>`` table, or schedule its computation.
      3. Schedule the rebuild of ``<root>/queries`` when a day was newly
         computed, when that table is missing, or when a per-day table was
         modified after it.
      4. Run the job and write the path of the aggregated table to the
         output file.
    """
    args = parse_args()
    names = _read_names(args.names)
    root = args.root
    queries_path = root + '/queries'
    cluster = _make_cluster(args)
    yt = cluster.driver.client
    us_tabs = get_tabs(args, yt)
    job = cluster.job(name='mma-774 us')
    something_new = False
    last_calculated_time = '1000-00-00'
    tabs = []
    for tab in us_tabs:
        # Paths look like <us_data>/<day>/clean, so [-2] is the day part.
        d_str = tab.split('/')[-2]
        path0 = root + '/us/' + d_str
        if exists_and_not_empty(path0, yt):
            tabs.append(job.table(path0))
            # First 19 characters of the attribute; presumably the
            # timestamp down to seconds -- TODO confirm the format.
            calc_time = yt.get_attribute(path0, 'modification_time')[:19]
            last_calculated_time = max(last_calculated_time, calc_time)
            continue
        something_new = True
        tabs.append(_daily_counts(job, tab, path0, d_str, names))
    if something_new or not yt.exists(queries_path):
        # A missing aggregate has no modification time to compare with;
        # asking for the attribute would fail, so just (re)build it.
        rebuild = True
    else:
        rebuild = last_calculated_time > yt.get_attribute(
            queries_path, 'modification_time'
        )[:19]
    # With no per-day tables there is nothing to concatenate.
    if rebuild and tabs:
        job.concat(*tabs) \
            .groupby('query', 'episode') \
            .aggregate(amount=na.sum('amount')) \
            .put('$jr/queries')
    job.run()
    args.output.write(queries_path)
    args.output.close()


def _read_names(path):
    """Return the set of non-empty lines of the UTF-8 text file ``path``."""
    with codecs.open(path, 'r', 'utf8') as f:
        return {line for line in f.read().split('\n') if line}


def _make_cluster(args):
    """Create the cluster object selected by ``args.cluster_name``."""
    templates = dict(jr=args.root)
    if args.cluster_name == 'hahn':
        return clusters.yt.Hahn(
            token=args.token
        ).env(
            templates=templates,
            parallel_operations_limit=2
        )
    return clusters.YT(
        args.cluster_name + '.yt.yandex.net',
        token=args.token
    ).env(
        templates=templates
    )


def _daily_counts(job, source, target, day, names):
    """Schedule the (query, episode) counts of ``source`` into ``target``."""
    return job.table(source) \
        .groupby('key').sort('subkey') \
        .reduce(
            ParseUSData(names),
            files=[
                nile.files.RemoteFile(
                    'statbox/statbox-dict-last/blockstat.dict'
                ),
                nile.files.RemoteFile(
                    'statbox/resources/libra.so'
                ),
            ],
            memory_limit=4000
        ) \
        .groupby('query', 'episode') \
        .aggregate(amount=na.count()) \
        .project(ne.all(), date=ne.const(day)) \
        .sort('amount') \
        .put(target)

# Run the pipeline only when the file is executed as a script.
if __name__ == '__main__':
    main()
