#-*-coding: utf8 -*-
from nile import (
    Record,
    aggregators as na,
    filters as nf
)
from nile.api.v1 import clusters
import nile
from datetime import datetime
import uatraits, cgi, urlparse
import random
import libra

def _entity_after_film(es_log):
    """Return the '|'-separated field that follows the first field containing 'Film'.

    Returns '-' when no field contains 'Film' or when the matching field is
    the last one in the log value.
    """
    fields = es_log.split('|')
    for idx, field in enumerate(fields):
        if 'Film' in field:
            if idx + 1 < len(fields):
                return fields[idx + 1]
            return '-'
    return '-'


def Reduce(groups):
    """Yield one Record per click on web requests with an accepted Film entity.

    For every (key, records) pair in `groups` the records are parsed into a
    session with `libra.ParseSession`. Only `TYandexWebRequest` items are
    considered, and only those whose search props contain
    'UPPER.EntitySearch.Log' with the substring 'Film' and
    'UPPER.EntitySearch.Accept' equal to '1'.

    Each yielded Record has the fields:
        uid  -- `key.key` of the group
        q    -- the request query as a string
        ruw  -- the log field following the first 'Film' field, or '-'
        es   -- the raw 'UPPER.EntitySearch.Log' value
        ts   -- the request date as 'YYYY-MM-DD' (local time of the worker)
        p    -- the click's converted path as a string
        host -- network location of the clicked URL (the full URL when it
                cannot be parsed)
    """
    for key, records in groups:
        uid = key.key

        try:
            session = libra.ParseSession(records, 'blockstat.dict')
        except Exception:
            # Best effort: a session that cannot be parsed is skipped, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            continue

        for r in session:
            if not r.IsA('TYandexWebRequest'):
                continue

            spv = r.SearchPropsValues

            # Guard clauses: both search props must be present and match.
            if 'UPPER.EntitySearch.Log' not in spv:
                continue
            es = spv['UPPER.EntitySearch.Log']
            if 'Film' not in es:
                continue
            if 'UPPER.EntitySearch.Accept' not in spv:
                continue
            if str(spv['UPPER.EntitySearch.Accept']) != '1':
                continue

            # Per-request values are computed only for requests that pass
            # the filter, once for all of their clicks.
            ruw = _entity_after_film(es)
            q = str(r.Query)
            ts = datetime.fromtimestamp(r.Timestamp).date().isoformat()

            for cl in r.GetClicks():
                url = str(cl.Url)
                p = str(cl.ConvertedPath)
                try:
                    host = urlparse.urlparse(url).netloc
                except Exception:
                    # Fall back to the raw URL when it cannot be parsed.
                    host = url

                yield Record(uid=uid, q=q, ruw=ruw, es=es, ts=ts, p=p, host=host)


# Cluster handle used by every job below. The `job_root` template is the one
# referenced as '$job_root' in the table paths used in this file.
cluster = clusters.yt.Hahn().env(
    templates={'job_root': 'home/search-research/ensuetina/KP_QUERIES'},
)


# Two-digit day strings inserted after the '2016-03-' prefix of the input
# table path. The first list is immediately overwritten by the second one, so
# only days 21-27 are in effect; '28'..'31' and a single-day ['01'] list were
# alternative settings.
dates = [
    '01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
    '11', '12', '13', '14', '15', '16', '17', '18', '19', '20',
]
dates = ['21', '22', '23', '24', '25', '26', '27']

for date in dates:
    # NOTE: this unconditional `continue` skips the rest of the body, so no
    # per-day extraction job is created or run while it is in place.
    continue

    job = cluster.job()

    us = job.table('user_sessions/pub/search/daily/2016-03-' + date + '/clean')

    # Group by 'key', order each group by 'subkey', then run Reduce with the
    # two local files attached to the operation.
    t = us.groupby('key').sort('subkey').reduce(
        Reduce,
        files=[
            nile.files.LocalFile('/home/ensuetina/blockstat.dict'),
            nile.files.LocalFile('/home/ensuetina/libra.so'),
        ],
        memory_limit=2000,
    )

    # Every day's output is appended to the same table.
    t.put('$job_root/clicks', append=True)

    job.run()

# Aggregation job: count the rows of '$job_root/data' per value of 'q', sort
# by that count and store the result in '$job_root/aggr_queries'.
# NOTE(review): this job reads '$job_root/data', whereas the per-day
# extraction loop in this file writes to '$job_root/clicks' -- confirm where
# the 'data' table is produced.
job = cluster.job()

t = (
    job.table('$job_root/data')
    .groupby('q')
    .aggregate(reqs=na.count())
    .sort('reqs')
    .put('$job_root/aggr_queries')
)

# Disabled alternative pipelines, kept for reference:
#t = job.table('$job_root/data').groupby('ruw').aggregate(count=na.count()).project('count',ontoid='ruw').put('$job_root/wiki_ids')
#mapping = job.table('$job_root/mapping')
#t.join(mapping,by='ontoid',type='inner').put('$job_root/joined_data')
#t.groupby('ruw').aggregate(count=na.count()).put('$job_root/wiki_ids')
#t.groupby('q','ruw').aggregate(count=na.count()).put('$job_root/result_table')

job.run()



