#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import os
import argparse
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    statface as ns,
    with_hints,
    Record
)
import nile.files as nfi
from qb2.api.v1 import typing as qt
import re
import datetime


# NOTE(review): presumably the search-props key under which the video blender
# stores the VIDEOQUICK intent weight -- confirm against the blender docs.
QUICK = 'UPPER.ApplyVideoBlender.IntentWeight/VIDEOQUICK'


def normalize_query(q):
    """Return a canonical text form of a search query.

    Byte strings are decoded as UTF-8 (undecodable bytes are replaced), the
    text is lower-cased, every character that is not a Latin ``a-z`` or a
    Russian letter is turned into a space, runs of spaces are collapsed and
    the result is stripped.

    :param q: the query, either a byte string or a text (unicode) string.
    :return: the normalized query as a text (unicode) string.
    """
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # check keeps its original meaning without naming a Python-2-only builtin.
    if not isinstance(q, type(u'')):
        q = q.decode('utf8', errors='replace')
    q = q.lower()
    # The patterns contain no backslashes, so plain ``u''`` literals are
    # identical to the former ``ur''`` ones (which Python 3 cannot parse).
    q = re.sub(u'[^a-zа-яё]', u' ', q)
    q = re.sub(u' +', u' ', q)
    return q.strip()


class GetFreshQueries(object):
    """Reducer that reports, per video search request, how many top results are "quick".

    Each group of records is parsed into a libra session.  For every video
    request in the session (desktop or touch) whose ``ServiceDomRegion`` is
    ``'ru'`` one record is emitted with:

    * ``query`` -- the request query passed through ``normalize_query``;
    * ``quick`` -- the number of results, among the first ``TOP_RESULTS``
      ``TVideoResult`` main results, whose ``VideoSource`` contains one of
      ``QUICK_MARKERS``.
    """

    # libra request types treated as video search requests (desktop, then touch).
    VIDEO_REQUEST_TYPES = (
        'TYandexVideoRequest',
        'TYandexVideoMordaRequest',
        'TYandexRelatedVideoRequest',
        'TTouchYandexVideoRequest',
        'TTouchYandexVideoPortalRequest',
        'TTouchYandexRelatedVideoRequest',
    )

    # Only this many leading video results of a request are inspected.
    TOP_RESULTS = 5

    # Substrings of ``VideoSource`` that mark a result as "quick".
    QUICK_MARKERS = ('QUICK', 'quick', 'Quick')

    def __init__(self, threshold):
        """Store ``threshold``.

        :param threshold: kept for interface compatibility; the reducer does
            not currently read it.
        """
        self.threshold = threshold

    def __call__(self, groups):
        """Yield one ``Record(query, quick)`` per matching video request.

        :param groups: iterable of ``(key, records)`` pairs as supplied by a
            grouped reduce operation.
        """
        # Imported lazily: libra is shipped to the job as a file and is only
        # importable where the reducer actually runs.
        import libra
        for key, recs in groups:
            # The whole group must be handed to the parser exactly once;
            # iterating ``recs`` beforehand would steal records from it.
            try:
                session = libra.ParseSession(recs, './blockstat.dict')
            except Exception:
                # Best effort: sessions libra cannot parse are skipped.
                continue

            for request in session:
                if not self._is_video_request(request):
                    continue
                if request.ServiceDomRegion != 'ru':
                    continue
                yield Record(
                    query=normalize_query(request.Query),
                    quick=self._count_quick(request)
                )

    def _is_video_request(self, request):
        """Return True if ``request`` is one of ``VIDEO_REQUEST_TYPES``."""
        return any(
            request.IsA(type_name) for type_name in self.VIDEO_REQUEST_TYPES
        )

    def _count_quick(self, request):
        """Count "quick" results among the first ``TOP_RESULTS`` video results."""
        quick = 0
        seen = 0
        for block in request.GetMainBlocks():
            result = block.GetMainResult()
            if not result.IsA('TVideoResult'):
                continue
            if seen >= self.TOP_RESULTS:
                break
            seen += 1
            # A missing source is treated as "not quick" instead of raising.
            source = result.VideoSource or ''
            if any(marker in source for marker in self.QUICK_MARKERS):
                quick += 1
        return quick


def main():
    """Parse command-line options, then build and run the job.

    The job reads a sessions table, reduces it with ``GetFreshQueries``,
    aggregates per query (row count and median ``quick``), keeps queries whose
    median ``quick`` equals 5, optionally keeps only the ``--top`` rows by
    count, sorts by count and writes the result to ``--output_table``.

    The cluster is chosen from the ``YT_PROXY`` environment variable and
    authenticated with ``YT_TOKEN`` and ``YQL_TOKEN``.

    :raises KeyError: if one of the required environment variables is unset.
    :raises RuntimeError: if ``--input_table`` is not given and no table
        ending in ``/clean`` is found under ``--root``.
    """
    parser = argparse.ArgumentParser()
    # NOTE: the threshold is passed to the reducer, which currently ignores it.
    parser.add_argument('--threshold', type=float, default=0.1)
    parser.add_argument(
        '--root', default='//user_sessions/pub/search/fast'
    )
    parser.add_argument('--pool')
    parser.add_argument('--input_table')
    parser.add_argument('--top', default=0, type=int)
    parser.add_argument('--output_table', required=True)
    args = parser.parse_args()

    # The first component of the proxy host, title-cased, is used as the
    # attribute name of the cluster on ``clusters.yql``.
    yproxy = os.environ['YT_PROXY'].split('.')[0].title()
    cluster = getattr(clusters.yql, yproxy)(
        token=os.environ['YT_TOKEN'],
        yql_token=os.environ['YQL_TOKEN']
    ).env(templates=dict(title='Get fresh queries | YQL'))
    if args.pool:
        cluster = cluster.update(pool=args.pool)

    yt = cluster.driver.yt_driver.client

    if args.input_table:
        input_table = args.input_table
    else:
        # No explicit table: take the greatest path ending in "/clean"
        # found under the root.
        candidates = list(yt.search(
            root=args.root,
            path_filter=lambda x: (x or '').endswith('/clean')
        ))
        if not candidates:
            raise RuntimeError(
                'no table ending with "/clean" found under {}'.format(
                    args.root
                )
            )
        input_table = max(candidates)

    job = cluster.job()

    stream = job.table(
        input_table
    ).groupby('key').reduce(
        with_hints(output_schema=dict(
            query=qt.String,
            quick=qt.Integer
        ))(
            GetFreshQueries(args.threshold)
        ),
        files=[
            nfi.RemoteFile(
                'statbox/statbox-dict-last/blockstat.dict'
            ),
            nfi.RemoteFile(
                'statbox/resources/libra.so'
            )
        ],
        memory_limit=4000
    ).groupby(
        'query'
    ).aggregate(
        count=na.count(),
        median_quick=na.median('quick')
    ).filter(
        nf.equals('median_quick', 5)
    )

    if args.top:
        stream = stream.top(args.top, by='count')

    # Single sort right before writing; sorting before ``top`` as well would
    # only add an extra operation without affecting the output.
    stream.sort(
        'count'
    ).put(
        args.output_table
    )

    job.run()


# Run the job only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
