# -*- coding: UTF-8 -*-

from datetime import datetime, timedelta, date
import urllib, re, random, cgi
from nile import (
    clusters,
    Record,
    filters as nf,
    aggregators as na
)
import nile
import uatraits, libra
import urlparse

def Reduce(groups, podval, soo_clicks, adv_serps, adv_clicks, other):
    """Split experiment web requests from user sessions into output streams.

    For every group whose key starts with 'y', the records are parsed with
    ``libra.ParseSession`` and each ``TYandexWebRequest`` with
    ``ServiceDomRegion == 'ru'`` that belongs to one of the three test ids
    below is inspected. Requests whose ``FullRequest`` has no query string
    are skipped.

    Args:
        groups: iterable of ``(key, records)`` pairs; ``key.key`` is used as uid.
        podval: output stream; gets one record per request that has a tech
            event whose path contains '690.405.487' or 'tech.pager.show'.
        soo_clicks: output stream; gets one record per click whose
            ``ConvertedPath`` contains 'soo/'.
        adv_serps: output stream; gets one record per request that has a
            block whose path contains 'head/advanced-search'.
        adv_clicks: output stream; gets one record per click on such a request.
        other: output stream kept for interface compatibility; nothing is
            written to it (failure reporting is switched off, see below).
    """
    control = '21464'
    exp_first = '21465'
    exp_second = '21466'
    # Checked in this order; the first test id the request carries wins.
    slot_labels = (
        (control, 'control ' + control),
        (exp_first, 'exp ' + exp_first),
        (exp_second, 'exp ' + exp_second),
    )
    # Substrings looked for in the CGI parameter names, in the order in
    # which the matches are reported in the `params` column.
    param_markers = (
        'within', 'rstr', 'site', 'wordforms',
        'lang', 'mime', 'from_date_full', 'to_date_full',
    )

    detector = uatraits.detector('/usr/share/uatraits/browser.xml')

    for key, records in groups:
        uid = key.key
        # Guard against an empty key before looking at its first character.
        if not uid or uid[0] != 'y':
            continue

        try:
            session = libra.ParseSession(records, 'blockstat.dict')
        except Exception:
            # Best effort: sessions that cannot be parsed are skipped
            # silently. Reporting them to `other` was deliberately disabled.
            continue

        for r in session:
            if not r.IsA('TYandexWebRequest'):
                continue
            if r.ServiceDomRegion != 'ru':
                continue

            # Date part (YYYY-MM-DD) of the request time, in local time.
            ts = datetime.fromtimestamp(r.Timestamp).date().isoformat()
            q = r.Query

            slot = None
            for test_id, label in slot_labels:
                if r.HasTestID(test_id):
                    slot = label
                    break
            if slot is None:
                continue

            ua = str(r.UserAgent)
            req = str(r.FullRequest)

            traits = detector.detect(ua)
            BR = str(traits.get('BrowserName'))
            OS = str(traits.get('OSName'))

            _, question_mark, query_string = req.partition('?')
            if not question_mark:
                continue

            # urlparse.parse_qs is the non-deprecated home of cgi.parse_qs.
            data = urlparse.parse_qs(query_string)

            # For every marker remember the parameter NAME that contains it
            # ('-' when none does). If several names match, the last one in
            # the dict's iteration order is kept.
            matched = dict.fromkeys(param_markers, '-')
            for param_name in data:
                for marker in param_markers:
                    if marker in param_name:
                        matched[marker] = param_name
            params = [matched[marker] for marker in param_markers]

            isP = 0
            for tech in r.GetYandexTechEvents():
                if not tech.IsA('TYandexTechEvent'):
                    continue
                tech_path = tech.Path
                if '690.405.487' in tech_path or 'tech.pager.show' in tech_path:
                    isP = 1
                    break

            if isP == 1:
                # podval serp
                podval(Record(uid=uid, slot=slot, ts=ts, BR=BR, OS=OS,
                              params=str(params)))

            isHead = 0
            if any('head/advanced-search' in bl.Path for bl in r.GetBSBlocks()):
                isHead = 1

            if isHead == 1:
                for cl in r.GetClicks():
                    p = cl.ConvertedPath
                    dw = cl.DwellTimeOnService
                    url = str(cl.Url)
                    # clicks on adv serps
                    adv_clicks(Record(uid=uid, slot=slot, ts=ts, q=q,
                                      isHead=isHead, p=p, dw=dw, url=url))

                # adv serps
                adv_serps(Record(uid=uid, slot=slot, ts=ts, q=q,
                                 isHead=isHead, params=str(params),
                                 OS=OS, BR=BR))

            for cl in r.GetClicks():
                p = cl.ConvertedPath
                if 'soo/' in p:
                    # soo clicks
                    soo_clicks(Record(uid=uid, slot=slot, ts=ts, q=q, p=p,
                                      podval=isP, url=str(cl.Url),
                                      OS=OS, BR=BR))

# Driver script: (optionally) extract per-day tables from user sessions with
# Reduce, then join clicks on advanced-search SERPs with the SERP parameters
# and count clicks per slot/day/path/params.
cluster = clusters.YT(proxy="hahn.yt.yandex.net").env(
    job_root='home/search-research/ensuetina/SOO_EXP'
)

# The per-day extraction is switched off (the loop used to start with an
# unconditional `continue`). The outputs are written with append=True, so
# re-running it over the same dates would add the same rows again; flip this
# flag only when the dates have to be (re)processed.
RUN_EXTRACTION = False

dates = nile.api.path.DateRange('2016-02-20', '2016-02-29')
if RUN_EXTRACTION:
    for day in dates:
        job = cluster.job()

        # The session logs are read from a different directory starting
        # with 2016-02-25.
        if int(str(day).replace('-', '')) < 20160225:
            path = 'userdata/user_sessions/search/daily/' + str(day) + '/clean'
        else:
            path = 'user_sessions/pub/search/daily/' + str(day) + '/clean'

        print(path)

        us = job.table(path)

        podval, soo_clicks, adv_serps, adv_clicks, other = (
            us.groupby('key')
              .sort('subkey')
              .reduce(
                  Reduce,
                  files=[
                      nile.files.LocalFile('blockstat.dict'),
                      nile.files.LocalFile('libra.so'),
                  ],
              )
        )
        podval.put('$job_root/podval', append=True)
        soo_clicks.put('$job_root/soo_clicks', append=True)
        adv_serps.put('$job_root/adv_serps', append=True)
        adv_clicks.put('$job_root/adv_clicks', append=True)
        # NOTE: unlike the tables above, this one is written without append.
        other.put('$job_root/other')

        job.run()

job = cluster.job()

# NOTE(review): '$job_root/adv_serps_aggr' is not produced by this script as
# it stands; presumably it was built earlier by grouping '$job_root/adv_serps'
# by uid, q, slot, ts, isHead, params with a count - confirm it exists before
# running.
t1 = job.table('$job_root/adv_serps_aggr').project('uid', 'q', 'ts', 'params')

t2 = job.table('$job_root/adv_clicks')

# Attach the SERP parameters to every click made on that SERP.
joined = t2.join(t1, by=['uid', 'q', 'ts'], type='inner').put('$job_root/adv_clicks_join')

joined.groupby('slot', 'ts', 'isHead', 'p', 'params') \
      .aggregate(clicks=na.count()) \
      .put('$job_root/adv_clicks_join_aggr')

job.run()



