# -*- coding: UTF-8 -*-
from nile.api.v1 import filters as nf, aggregators as na, extractors as ne, statface as ns, clusters, Record, cli, with_hints

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import json, re
import urllib, urlparse
import math, cgi
import pandas as pd
from itertools import product
import sys
import os
import re

from nile.api.v1 import files as nfi
from qb2.api.v1 import resources as qr
from random import shuffle, sample, random

from qb2.api.v1.typing import String, Int32, Int64, UInt64, Float


# Lazily built translation table (code point -> u' ') consumed by
# normalize_query(); stays None until build_translation() is first called.
TRANSLATION = None


# Column types of the records yielded by parse_us(); passed to with_hints().
output_schema_0 = {
    'query': String,  # request query exactly as read from r.Query
    'norm_query': String,  # normalize_query(r.Query)
    'service': String,  # 'video' or 'web'
    'platform': String,  # 'desktop' or 'touch'
    'domain': String,  # r.ServiceDomRegion upper-cased
    'regionId': Int32,  # r.UserRegion
    'country': String,  # short name returned by get_country()
    'fresh_intent': Float,  # freshness score read from the search props
    'wizard_position': UInt64,  # video wizard position; 9999 when the prop is absent
}


def build_translation():
    """Return the shared ``unicode.translate`` table, building it on first use.

    The table maps to a single space every code point below
    ``sys.maxunicode`` whose Unicode category starts with ``'P'``
    (punctuation), plus the control and symbol characters listed explicitly
    below. The result is cached in the module-level ``TRANSLATION`` so the
    scan over the code point range runs at most once per process.
    """
    global TRANSLATION

    if TRANSLATION is not None:
        return TRANSLATION

    import sys
    import unicodedata

    blank = u' '
    table = {}

    for code_point in xrange(sys.maxunicode):
        if unicodedata.category(unichr(code_point)).startswith('P'):
            table[code_point] = blank

    # Characters that are not in a 'P' category but must become spaces too.
    for extra in u'\t\n\x0b\x0c\r$+<=>^`|~':
        table[ord(extra)] = blank

    TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Return a canonical UTF-8 form of *query* for grouping equal queries.

    Steps: decode from UTF-8, replace punctuation and the extra characters
    from ``build_translation()`` with spaces, lower-case, strip, and collapse
    every whitespace run into a single space.

    :param query: UTF-8 encoded byte string; already-decoded text is also
        accepted and used as is.
    :return: the normalized query encoded as UTF-8, or ``None`` when the
        input bytes are not valid UTF-8.
    """
    # Decode only real byte strings: on Python 2 calling .decode() on a
    # unicode object first encodes it as ASCII and raises UnicodeEncodeError
    # for any non-ASCII text.
    if isinstance(query, bytes):
        try:
            query = query.decode('utf8')
        except UnicodeDecodeError:
            return None

    query = query.translate(build_translation())
    query = query.lower().strip()
    # re.UNICODE makes \s cover the same Unicode whitespace that strip()
    # already removes at the edges (e.g. no-break space); without it Python 2
    # collapses ASCII whitespace only and such runs survive inside the query.
    query = re.sub(r'\s+', u' ', query, flags=re.UNICODE)

    return query.encode('utf8')


def get_country(region):
    """Return the short name of the first type-3 entry in ``region.path``.

    Entries of ``region.path`` are scanned in order; ``"UNK"`` is returned
    when none of them has ``type == 3``.
    NOTE(review): type 3 presumably marks the country level in the geobase
    hierarchy -- confirm against the geobase documentation.
    """
    matches = (node.short_name for node in region.path if node.type == 3)
    return next(matches, "UNK")


@with_hints(output_schema_0, ensure_optional_types=True)
def parse_us(session):
    """Yield one Record per supported search request found in *session*.

    *session* is iterated as ``(key, container)`` pairs and every request
    from ``container.GetRequests()`` is examined. Requests of an unsupported
    type, or from users whose country is not RU/UA/BY/KZ/UZ, are skipped.

    Each yielded Record carries the raw and normalized query, the service
    ('video' or 'web'), the platform ('desktop', or 'touch' for every other
    UI), the upper-cased service domain, the user region id, the country
    short name, the fresh-intent score and the video wizard position
    (9999 when the search prop is absent).
    """
    import libra  # imported by the original code; not referenced by name here

    # Ordered (request type, UI label) pairs: the first type the request
    # matches wins, so the order below is significant.
    ui_by_request_type = (
        ('TYandexVideoRequest', 'desktop video'),
        ('TTouchYandexVideoRequest', 'touch video'),
        ('TYandexWebRequest', 'desktop web'),
        ('TTouchYandexWebRequest', 'touch web'),
        ('TMobileAppYandexVideoRequest', 'app video'),
        ('TMobileAppYandexVideoPortalRequest', 'app video'),
        ('TMobileAppYandexRelatedVideoRequest', 'app video'),
        ('TPadYandexWebRequest', 'pad web'),
        ('TPadYandexVideoRequest', 'pad video'),
    )
    allowed_countries = ("RU", "UA", "BY", "KZ", "UZ")

    geobase = qr.get('Geobase')

    for _key, container in session:
        for request in container.GetRequests():
            ui = next(
                (label for type_name, label in ui_by_request_type if request.IsA(type_name)),
                None,
            )
            if ui is None:
                continue

            is_video = 'video' in ui

            # Video and web requests keep the freshness score under different props.
            props = request.SearchPropsValues
            intent_prop = "VIDEOQUICK.QueryFreshintent" if is_video else "Fresh.IntentProbability"
            fresh_intent = float(props.get(intent_prop, 0))
            wizard_position = int(props.get('UPPER.ApplyBlender.IntentPos/VIDEOWIZ', 9999))

            region_id = request.UserRegion
            country = get_country(geobase.region_by_id(int(region_id)))
            if country not in allowed_countries:
                continue

            yield Record(
                query=request.Query,
                norm_query=normalize_query(request.Query),
                service='video' if is_video else 'web',
                # Every non-desktop UI (touch, pad, app) is reported as 'touch'.
                platform='desktop' if 'desktop' in ui else 'touch',
                domain=request.ServiceDomRegion.upper(),
                regionId=region_id,
                country=country,
                fresh_intent=fresh_intent,
                wizard_position=wizard_position,
            )


@cli.statinfra_job
def make_job(job, options):
    """Build the job that collects fresh queries for ``options.dates[0]``.

    Pipeline:
      1. Concatenate the daily web-search and video user-session tables and
         parse them with ``parse_us``.
      2. Keep rows with a non-empty, not purely numeric ``norm_query`` that
         are either web rows with ``wizard_position < 9`` or video rows with
         ``fresh_intent > 0.5``.
      3. Aggregate per (norm_query, service, platform, country): row count,
         any regionId, mean fresh_intent and mean wizard_position.
      4. Sort by the negated count and store the result in ``$job_root/aggr``.
      5. Keep video rows and web rows whose mean wizard position is below 3,
         then write the desktop and touch rows to separate per-date tables.

    Returns the configured job.
    """
    job = job.env(
        templates=dict(job_root='//home/videolog/vika-pavlova/fresh_queries'),
        yt_spec_defaults=dict(
            pool_trees=["physical"],
            use_default_tentative_pool_trees=True,
        ),
    )

    date = options.dates[0]

    sessions = job.concat(
        job.table('user_sessions/pub/search/daily/' + date + '/columns/clean'),
        job.table('user_sessions/pub/video/daily/' + date + '/columns/clean'),
    )

    parsed = sessions.libra(
        parse_us,
        libra_file=nile.files.RemoteFile("//statbox/resources/libra_nile_udf2.7.so"),
        files=[nfi.StatboxDict('Geobasev6.bin')],
        memory_limit=5 * 1024,
    )

    candidates = parsed.filter(
        sf.custom(
            lambda service, wizard_position, fresh_intent, norm_query: (
                norm_query
                and not norm_query.isdigit()
                and (
                    (service == 'web' and wizard_position < 9)
                    or (service == 'video' and fresh_intent > 0.5)
                )
            ),
            'service',
            'wizard_position',
            'fresh_intent',
            'norm_query',
        ),
    )

    aggregated = candidates.groupby(
        'norm_query',
        'service',
        'platform',
        'country',
    ).aggregate(
        freq=na.count(),
        regionId=na.any("regionId"),
        fresh_intent=na.mean('fresh_intent'),
        wizard_position=na.mean('wizard_position'),
    )

    # neg_freq is the sort key; presumably the sort is ascending, so negating
    # the count puts the most frequent queries first -- TODO confirm.
    ranked = aggregated.project(
        ne.all(),
        neg_freq=ne.custom(lambda freq: -freq, 'freq').with_type(Int64),
    ).sort(
        'neg_freq',
    ).put(
        '$job_root/aggr',
    )

    final = ranked.filter(
        sf.custom(
            lambda service, wizard_position: (
                service == 'video' or (service == 'web' and wizard_position < 3)
            ),
            'service',
            'wizard_position',
        ),
    )

    # One output table per platform: $job_root/desktop_<date>, $job_root/touch_<date>.
    for platform in ('desktop', 'touch'):
        final.filter(
            sf.equals('platform', platform),
        ).put(
            '$job_root/' + platform + '_' + date,
        )

    return job


# Script entry point: hand control to the nile CLI runner
# (presumably it runs the job registered via @cli.statinfra_job above).
if __name__ == '__main__':
    cli.run()
