from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf

import nile
import datetime
import argparse
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
import sys

# Process-wide cache for the code point -> u' ' table; it stays None until
# build_translation() fills it on its first call.
TRANSLATION = None

def build_translation():
    """Return the table that maps separator-like code points to a space.

    The table is a dict ``{code point: u' '}`` suitable for
    ``unicode.translate``. It is built once, stored in the module-level
    ``TRANSLATION`` and returned as-is on every later call.
    """
    global TRANSLATION
    if TRANSLATION is not None:
        return TRANSLATION

    import sys
    import unicodedata

    blank = u' '
    table = {}

    # Every code point whose Unicode general category starts with 'P'.
    for code_point in xrange(sys.maxunicode):
        if unicodedata.category(unichr(code_point)).startswith('P'):
            table[code_point] = blank

    # These characters are mapped explicitly, in addition to the 'P*' ones.
    for extra in u'\t\n\x0b\x0c\r$+<=>^`|~':
        table[ord(extra)] = blank

    TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Normalize a UTF-8 encoded query string.

    The query is decoded, the characters listed in the
    ``build_translation()`` table are replaced by spaces, the text is
    lower-cased, stripped, and every run of two or more whitespace
    characters is collapsed into a single space.

    Returns the normalized query encoded back to UTF-8, or ``None`` when
    the input cannot be decoded as UTF-8.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        return None

    cleaned = text.translate(build_translation()).lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', cleaned)
    return collapsed.encode('utf8')

def parse_us(groups):
    """Reducer: emit one record per music-flagged video request.

    For every ``(key, recs)`` group the records are parsed into a libra
    session using ``./blockstat.dict``. Groups that libra fails to parse
    are skipped. Within a session only requests that

    * are one of the four video request types listed below,
    * have ``ServiceDomRegion == 'ru'``, and
    * carry music/track markers in ``RelevValues``

    produce output.

    Yields ``Record`` objects with the fields:
        uid      -- group key with every 'y' and the 'uu/' prefix removed
        reqid    -- ``ReqID`` of the request
        ui       -- label of the matched request type
        track    -- ``0`` or ``-1`` (negated track flag)
        tvt      -- sum over video results of
                    ``max(min(PlayingDuration, Duration), heartbeat Ticks)``
        neg      -- constant ``-1``
        q        -- normalized query (``None`` if it is not valid UTF-8)
        top_url  -- URL of the first video result with a non-empty URL

    NOTE(review): ``track`` and ``neg`` are presumably negative so that an
    ascending sort downstream puts the most frequent rows first -- confirm.
    """
    import libra

    # Checked in this order; the first matching request type wins.
    video_request_types = (
        ('TYandexVideoRequest', 'desktop video'),
        ('TTouchYandexVideoRequest', 'touch video'),
        ('TPadYandexVideoRequest', 'pad video'),
        ('TMobileAppYandexVideoRequest', 'app video'),
    )

    for key, recs in groups:
        uid = key.key.replace('y', '').replace('uu/', '')

        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Deliberate best effort: a session libra cannot parse is dropped
            # rather than failing the whole reduce operation.
            continue

        for r in session:
            ui = None
            for type_name, label in video_request_types:
                if r.IsA(type_name):
                    ui = label
                    break
            if ui is None:
                continue

            if r.ServiceDomRegion != 'ru':
                continue

            # Relevance markers are compared as strings, exactly as before.
            relev = r.RelevValues
            mut = str(relev.get('mut'))
            mu = str(relev.get('mu'))
            muvt = str(relev.get('muvt'))

            track = 1 if (mut == '5' or muvt == '2') else 0
            music = 1 if (mu == '1' or muvt in ('1', '2')) else 0
            if not (music or track):
                continue

            tvt = 0
            top_url = None
            for block in r.GetMainBlocks():
                result = block.GetMainResult()
                if not result.IsA("TVideoResult"):
                    continue

                if not top_url:
                    top_url = str(result.Url)

                duration = r.FindVideoDurationInfo(result)
                if duration:
                    played = min(duration.PlayingDuration, duration.Duration)
                else:
                    played = 0

                heartbeat = r.FindVideoHeartbeat(result, 'ANY')
                ticks = heartbeat.Ticks if heartbeat else 0

                tvt += max(played, ticks)

            # Normalization is only needed for requests that are emitted.
            q = normalize_query(r.Query)

            yield Record(uid=uid, reqid=r.ReqID, ui=ui, track=-1 * track,
                         tvt=tvt, neg=-1, q=q, top_url=top_url)

def url_map(records):
    """Mapper: replace the ``urls`` histogram with its most frequent URL.

    Each input record is expected to hold ``urls`` as a sequence of
    ``(url, count)`` pairs. The URL with the highest count is selected;
    on equal counts the pair that comes later in the sequence wins.
    Records that have no ``urls`` field are written to stderr and skipped.

    Yields ``Record`` objects with ``freq``, ``tvt``, ``neg``, ``url``,
    ``query`` (taken from ``q``) and ``track``.
    """
    for row in records:
        if "urls" not in row:
            print >> sys.stderr, row
            continue

        winner = None
        top_count = 0
        for entry in row['urls']:
            count = entry[1]
            # ">=" keeps the original tie-breaking: the later entry wins.
            if count >= top_count:
                top_count = count
                winner = entry[0]

        yield Record(
            freq=row['freq'],
            tvt=row['tvt'],
            neg=row['neg'],
            url=winner,
            query=row['q'],
            track=row['track'],
        )

def main():
    """Build the weekly table of top music queries for video search.

    Command line:
        --dates YYYY-MM-DD   last day of the 7-day window (required)

    For each of the seven days ending at the given date, the daily
    ``user_sessions/pub/search/daily/<date>/clean`` table is reduced with
    ``parse_us`` and appended to ``$job_root/reqs1w``. That table is then
    aggregated per ``(q, track)``, passed through ``url_map``, cut to the
    10000 rows with the highest ``freq``, sorted by ``neg`` and written to
    ``$job_root/<start>_<end>``. Finally the intermediate ``reqs1w`` table
    is removed.

    NOTE(review): ``reqs1w`` is only removed after a fully successful run;
    if a run fails midway, the next run appends to the leftover data.
    Confirm whether a cleanup before the first daily job is wanted.
    """
    date_format = "%Y-%m-%d"
    job_root = 'home/videolog/itajn/music_queries'

    parser = argparse.ArgumentParser()
    # The date is mandatory: without it strptime() below would be handed None
    # and fail with a TypeError, so let argparse report the problem instead.
    parser.add_argument('--dates', dest='date', required=True,
                        help="Last date (YYYY-MM-DD)")
    args, _ = parser.parse_known_args()

    cluster = clusters.yt.Hahn(pool='videolog').env(
        templates=dict(job_root=job_root),
        parallel_operations_limit=10,
    )

    enddate = datetime.datetime.strptime(args.date, date_format)
    startdate = enddate - datetime.timedelta(days=6)
    final_name = '%s_%s' % (startdate.strftime(date_format),
                            enddate.strftime(date_format))

    # One job per day, startdate .. enddate inclusive (7 days).
    for offset in range(7):
        day = startdate + datetime.timedelta(days=offset)
        date = day.strftime(date_format)

        job = cluster.job()
        job.table(
            'user_sessions/pub/search/daily/%s/clean' % date
        ).groupby('key').sort('subkey').reduce(
            parse_us,
            files=[
                nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
                nile.files.RemoteFile('statbox/resources/libra.so'),
            ],
            memory_limit=4000,
        ).put('$job_root/reqs1w', append=True)
        job.run()

    job = cluster.job()
    job.table(
        '$job_root/reqs1w'
    ).groupby('q', 'track').aggregate(
        freq=na.count(),
        tvt=na.sum('tvt'),
        neg=na.sum('neg'),
        urls=na.histogram('top_url'),
    ).map(url_map).top(10000, by='freq').sort('neg').put(
        '$job_root/%s' % final_name
    )
    job.run()

    # Drop the intermediate table now that the final table has been written.
    cluster.driver.remove(job_root + '/reqs1w')

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
