# -*- coding: UTF-8 -*-
from nile import (
    clusters,
    Record,
    filters as nf,
    aggregators as na
)
import nile
import uatraits, libra

from datetime import datetime

import urlparse
import cgi

def Reduce(groups, reqs, clicks):
    """Reducer: emit request and click records for touch requests under test.

    For every user session, each touch web request that carries either the
    control or the experiment test id produces one record on ``reqs`` and
    one record on ``clicks`` per click on a non-direct block.

    Args:
        groups: iterable of ``(key, records)`` pairs; ``key.key`` is the uid
            and ``records`` is handed to ``libra.ParseSession``.
        reqs: output callable, called with one ``Record`` per request.
            ``all_cl`` counts clicks on blocks whose main result is not a
            ``TDirectResult``; ``direct_cl`` counts clicks on blocks whose
            main result is a ``TDirectResult``.
        clicks: output callable, called with one ``Record`` per click on a
            non-direct block (click path, dwell time and click referer).
    """
    control_id = '21142'
    exp_id = '21143'

    detector = uatraits.detector('/usr/share/uatraits/browser.xml')
    trait_names = ('BrowserName', 'BrowserVersion', 'OSName', 'OSVersion')

    for key, records in groups:
        uid = key.key
        # startswith() rather than uid[0]: an empty uid is skipped instead of
        # raising IndexError.  Only uids beginning with 'y' are processed.
        if not uid.startswith('y'):
            continue

        try:
            session = libra.ParseSession(records, 'blockstat.dict')
        except Exception:
            # Best effort: a session that fails to parse is dropped.  Unlike
            # a bare ``except:``, this lets KeyboardInterrupt and SystemExit
            # propagate.
            continue

        for r in session:
            if not r.IsA('TTouchYandexWebRequest'):
                continue

            # Cheap filter first: skip requests outside both test groups
            # before doing any timestamp or user-agent work.
            if r.HasTestID(control_id):
                slot = 'control ' + control_id
            elif r.HasTestID(exp_id):
                slot = 'exp ' + exp_id
            else:
                continue

            # Date part (YYYY-MM-DD) of the request time, in local time.
            ts = datetime.fromtimestamp(r.Timestamp).date().isoformat()

            # Missing traits are reported as empty strings.
            detected = detector.detect(str(r.UserAgent))
            BR, BV, OS, OSV = [
                detected[name] if name in detected else ''
                for name in trait_names
            ]

            # Fields shared by the request record and all its click records.
            common = dict(
                uid=uid, slot=slot, ts=ts, reqid=r.ReqID,
                page=str(r.PageNo), ref=str(r.Referer),
                BR=BR, BV=BV, OS=OS, OSV=OSV,
            )

            all_cl = 0
            direct_cl = 0

            for bl in r.GetMainBlocks():
                if bl.GetMainResult().IsA('TDirectResult'):
                    # Direct blocks are only counted, never emitted.
                    direct_cl += sum(1 for _ in bl.GetClicks())
                    continue

                for cl in bl.GetClicks():
                    all_cl += 1
                    clicks(Record(
                        p=cl.ConvertedPath,
                        dw=cl.DwellTimeOnService,
                        cl_ref=cl.Referer,
                        **common
                    ))

            reqs(Record(all_cl=all_cl, direct_cl=direct_cl, **common))


def map_sess(lines):
    """Mapper: yield one record per www.yandex ACCESS row that carries a clid.

    Each input row exposes ``key`` (used as uid), ``subkey`` (a numeric
    timestamp string) and ``value`` (tab-separated ``name=value`` pairs).
    A row is emitted only when its ``type`` is ``ACCESS``, its ``service``
    is ``www.yandex`` and the query string of its ``request`` field contains
    a ``clid`` parameter; every other row is skipped.

    Yields:
        Record with ``uid``, ``ts`` (YYYY-MM-DD, local time), ``reqid``,
        ``clid`` and ``request``.
    """
    for line in lines:
        # Fragments without '=' are ignored; values may themselves hold '='.
        fields = dict(
            chunk.split('=', 1)
            for chunk in line.value.split('\t')
            if '=' in chunk
        )

        # A missing type/service yields None and is filtered out here too.
        if fields.get('type') != 'ACCESS':
            continue
        if fields.get('service') != 'www.yandex':
            continue

        request = fields.get('request')
        if not request:
            continue

        _, sep, query = request.partition('?')
        if not sep:
            # No query string at all, so there cannot be a clid.
            continue

        params = cgi.parse_qs(query)
        if 'clid' not in params:
            continue

        # The timestamp is parsed only for rows that are actually emitted,
        # so a malformed subkey on an irrelevant row no longer aborts the
        # mapper, and the conversion is skipped for all filtered rows.
        ts = datetime.fromtimestamp(float(line.subkey)).date().isoformat()

        # NOTE(review): parse_qs maps each name to a *list* of values, so
        # str() stores the list's repr (e.g. "['123']"), not the bare clid.
        # Kept as is to stay compatible with rows already written; confirm
        # whether downstream consumers expect this format.
        yield Record(
            uid=line.key,
            ts=ts,
            reqid=fields.get('reqid'),
            clid=str(params['clid']),
            request=request,
        )



# YT cluster handle; `$job_root` in the table paths below refers to this
# environment variable.
cluster = clusters.YT(proxy="hahn.yt.yandex.net").env(
    job_root='home/search-research/ensuetina/HTTPS_NEW_UIDS'
)

# Days to collect.
# nile.api.path.DateRange('2016-03-15', '2016-03-23')
dates = [
    '2016-03-17',
    '2016-03-18',
    '2016-03-19',
    '2016-03-20',
    '2016-03-21',
    '2016-03-22',
    '2016-03-23',
]

# Step 1: per-day collection of clids, requests and clicks.
for date in dates:
    # NOTE: this `continue` skips the rest of the loop body, so no daily job
    # is built or run; only the join job after the loop executes.
    # Presumably the tables were already filled by an earlier run -- confirm
    # before removing it, since every put() below appends.
    continue
    job = cluster.job().env(date=date)

    us = job.table('user_sessions/pub/search/daily/$date/clean')

    clids = us.map(map_sess).put('$job_root/clids', append=True)
    reqs, clicks = us.groupby('key').sort('subkey').reduce(
        Reduce,
        files=[
            nile.files.LocalFile('/home/ensuetina/blockstat.dict'),
            nile.files.LocalFile('/home/ensuetina/lib/libra.so'),
        ]
    )

    reqs.put('$job_root/reqs', append=True)
    clicks.put('$job_root/clicks', append=True)

    job.run()

# Step 2: inner-join the collected requests with their clids on reqid.
job = cluster.job()

clids = job.table('$job_root/clids').project('reqid', 'clid', 'request')
reqs = job.table('$job_root/reqs')

reqs.join(
    clids,
    by='reqid',
    type='inner'
).put('$job_root/reqs_with_clids')
#t = job.table('$job_root/reqs').groupby('ts').aggregate(count=na.count()).put('$job_root/dates')

job.run()

