from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math, cgi
import pandas as pd
from itertools import product
import sys
import os

# Shared YT (Hahn) cluster handle used by every job in this script.
# ``job_root`` is the value substituted for ``$job_root`` in table paths.
cluster = clusters.yt.Hahn().env(
    templates={'job_root': 'home/videolog/vika-pavlova/1812_for_tv_online'},
    yt_spec_defaults={
        'pool_trees': ["physical"],
        'tentative_pool_trees': ["cloud"],
    },
    parallel_operations_limit=10,
)

# Cache slot for the character translation table; filled on first use by
# build_translation().
TRANSLATION = None

def build_translation():
    """Return the cached code-point -> u' ' table used to blank out punctuation.

    The table is built on the first call and stored in the module-level
    ``TRANSLATION``; later calls return the same dict. It maps every code
    point below ``sys.maxunicode`` whose Unicode category starts with 'P',
    plus a fixed set of control characters and symbols, to a single space.
    """
    global TRANSLATION
    if TRANSLATION is not None:
        return TRANSLATION

    import sys
    import unicodedata

    table = {}
    for code_point in xrange(sys.maxunicode):
        if unicodedata.category(unichr(code_point)).startswith('P'):
            table[code_point] = u' '

    # Characters outside the 'P*' categories that are blanked out as well.
    for extra in u'\t\n\x0b\x0c\r$+<=>^`|~':
        table[ord(extra)] = u' '

    TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Normalize a UTF-8 encoded query and return it UTF-8 encoded.

    Characters listed in the build_translation() table become spaces, the
    text is lower-cased and stripped, and runs of two or more whitespace
    characters collapse to one space.

    Returns None when ``query`` is not valid UTF-8.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        return None

    text = text.translate(build_translation()).lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', text)
    return collapsed.encode('utf8')

def parse_us(groups):
    """Reducer: emit one record per first-page web request showing the 'tv' wizard.

    ``groups`` yields ``(key, recs)`` pairs; ``key.key`` becomes ``uid`` and
    ``recs`` is handed to ``libra.ParseSession`` together with the
    ``./blockstat.dict`` file. Sessions that libra fails to parse are skipped.

    For every ``TYandexWebRequest`` with ``PageNo`` equal to 0, the main blocks
    are scanned for a wizard result (``TWizardResult`` or
    ``TBlenderWizardResult``) named 'tv'. The first such block produces a
    ``Record`` with fields ``uid``, ``ui``, ``q``, ``date``, ``time``,
    ``reqid``, ``url``, ``name`` and ``content_id``; at most one record is
    yielded per request.

    ``content_id`` is the value following ``stream_id=`` in the result URL
    (up to the next ``&``), or '0' when the URL has no such parameter.
    """
    import libra

    for key, recs in groups:
        uid = key.key

        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Deliberate best effort: an unparsable session is dropped, not fatal.
            continue

        for r in session:
            if not r.IsA('TYandexWebRequest'):
                continue
            ui = 'desktop web'

            # Only the first results page is considered.
            if str(r.PageNo) != '0':
                continue

            for bl in r.GetMainBlocks():
                result = bl.GetMainResult()
                if not result.IsA("TWizardResult") and not result.IsA('TBlenderWizardResult'):
                    continue

                name = result.Name
                if name != 'tv':
                    continue

                url = str(result.Url)
                try:
                    content_id = url.split('stream_id=')[1].split('&')[0]
                except IndexError:
                    # No 'stream_id=' in the URL.
                    content_id = '0'

                # Query normalisation and date formatting are done only for
                # requests that actually produce a record.
                yield Record(
                    uid=uid,
                    ui=ui,
                    q=normalize_query(str(r.Query)),
                    # Calendar date of the request in the worker's local timezone.
                    date=datetime.datetime.fromtimestamp(r.Timestamp).date().isoformat(),
                    time=r.Timestamp,
                    reqid=r.ReqID,
                    url=url,
                    name=name,
                    content_id=content_id,
                )
                # One record per request: stop at the first 'tv' wizard block.
                break

def five_min_time(timest):
    """Round a timestamp down to the start of its 300-second (five-minute) bucket.

    ``timest`` is converted with ``int()`` first, so floats are truncated and
    numeric strings are accepted. Returns an integer.
    """
    seconds = int(timest)
    return seconds - seconds % 300

def us_process(date):
    """Run the two-stage pipeline for one day of search user sessions.

    Stage 1 reduces ``user_sessions/pub/search/daily/<date>/clean`` with
    ``parse_us`` (grouped by ``key``, ordered by ``subkey``) and stores the
    result, sorted by ``uid``, in ``$job_root/reqs_full``. That path does not
    depend on ``date``.

    Stage 2 groups those records by ``(q, url, content_id)``, computes the row
    count and the min / mean / median of ``time``, rounds each time statistic
    with ``five_min_time`` and writes ``q``, ``content_id``, ``freq``,
    ``min_time``, ``mean_time`` and ``med_time`` to
    ``$job_root/sort_table_<date>``, sorted by ``q`` and then ``freq``.

    :param date: day to process, as the string spliced into the table paths
    """
    # Stage 1: extract 'tv' wizard requests from raw sessions.
    job = cluster.job()

    sessions = job.table('user_sessions/pub/search/daily/' + date + '/clean')
    sessions.groupby('key').sort('subkey').reduce(
        parse_us,
        files=[
            nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
            nile.files.RemoteFile('statbox/resources/libra.so'),
        ],
        memory_limit=4000,
    ).sort('uid').put('$job_root/reqs_full')

    job.run()

    # Stage 2: per-(query, url, content) frequency and time statistics.
    job = cluster.job()

    stats = job.table('$job_root/reqs_full').groupby('q', 'url', 'content_id').aggregate(
        freq=na.count(),
        min_timest=na.min('time'),
        mean_timest=na.mean('time'),
        med_timest=na.median('time'),
    )

    stats.project(
        'q', 'content_id', 'freq',
        min_time=ne.custom(five_min_time, 'min_timest'),
        mean_time=ne.custom(five_min_time, 'mean_timest'),
        med_time=ne.custom(five_min_time, 'med_timest'),
    ).sort(
        # One composite sort instead of .sort('freq').sort('q'): with chained
        # sorts only the last key is the table's final order, so the 'freq'
        # pass was a wasted operation.
        'q', 'freq'
    ).put(
        '$job_root/sort_table_' + date,
        schema={"q": str,
                "content_id": str,
                "freq": int,
                "mean_time": int,
                "med_time": int,
                "min_time": int
                },
    )

    job.run()

def main():
    """Parse ``--start_date`` / ``--end_date`` and run us_process for each day.

    Both options are required. Every date produced by ``pd.date_range`` for
    that interval is passed to ``us_process`` as a ``YYYY-MM-DD`` string.
    """
    arg_parser = argparse.ArgumentParser()
    for flag in ('--start_date', '--end_date'):
        arg_parser.add_argument(flag, type=str, required=True)
    options = arg_parser.parse_args()

    days = pd.date_range(start=options.start_date, end=options.end_date)
    for day in days:
        us_process(day.date().isoformat())


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
