#-*-coding: utf8 -*-

from nile import (
    clusters,
    Record,
    filters as nf,
    aggregators as na
)
import nile
import uatraits, libra

import urlparse

def Reduce(groups):
    """Yield a ``Record(uid, slot)`` for every touch web request in a test slot.

    ``groups`` is an iterable of ``(key, records)`` pairs. ``key.key`` is used
    as the uid, and ``records`` are handed to ``libra.ParseSession`` together
    with the ``'blockstat.dict'`` file name. Groups whose session cannot be
    parsed are skipped silently (best effort).

    For each parsed request that ``IsA('TTouchYandexWebRequest')``, a record is
    emitted with ``slot`` set to ``'control 20435'`` or ``'exp 20437'``
    depending on which test id the request carries; the control id is checked
    first. Requests carrying neither id are ignored.
    """
    control_id = '20435'
    exp_id = '20437'

    for key, records in groups:
        uid = key.key

        try:
            session = libra.ParseSession(records, 'blockstat.dict')
        except Exception:
            # Best effort: skip groups that fail to parse, but let
            # KeyboardInterrupt / SystemExit propagate instead of eating them.
            continue

        for request in session:
            # Only touch web requests are of interest.
            if not request.IsA('TTouchYandexWebRequest'):
                continue

            if request.HasTestID(control_id):
                slot = 'control ' + control_id
            elif request.HasTestID(exp_id):
                slot = 'exp ' + exp_id
            else:
                continue

            yield Record(uid=uid, slot=slot)

def map_csp(lines):
    """Extract the ``yandexuid`` value from each record's ``path``.

    For every input record whose ``path`` contains ``'yandexuid='``, the text
    following the first occurrence of that marker is cut at the first ``'&'``
    or, when there is no ``'&'``, at the first ``';'``. If the remaining text
    contains ``'='``, only the part before it is kept. The result is emitted
    as ``Record(uid='y' + value)``.

    Records are skipped when ``path`` cannot be read, when it does not contain
    the marker (this covers the ``'-'`` placeholder), or when the value is not
    followed by ``'&'`` or ``';'``.
    """
    marker = 'yandexuid='

    for line in lines:
        try:
            path = line.path
        except Exception:
            # Best effort: a record without a readable ``path`` is skipped.
            # KeyboardInterrupt / SystemExit are no longer swallowed here.
            continue

        start = path.find(marker)
        if start < 0:
            continue

        tail = path[start + len(marker):]

        # Prefer '&' as the terminator; fall back to ';' only when no '&' exists.
        end = tail.find('&')
        if end < 0:
            end = tail.find(';')
        if end < 0:
            # NOTE(review): a yandexuid that runs to the end of ``path`` (no
            # terminator) is dropped, as in the original logic. Presumably this
            # guards against truncated values -- confirm it is intended.
            continue

        value = tail[:end].split('=')[0]
        yield Record(uid='y' + value)

def map_uids(lines):
    """Trim each record's ``uid`` at the first double quote and re-emit it.

    The incoming record object itself is modified (its ``uid`` attribute is
    reassigned) and then yielded; a ``uid`` without a double quote is left
    unchanged.
    """
    for record in lines:
        head, _, _ = record.uid.partition('\"')
        record.uid = head
        yield record

# User name embedded in the job_root path below.
username = 'ensuetina'
# Cluster handle with `job_root` set to 'home/search-research/ensuetina/nile/CSP'
# (the adjacent string literals '/' and 'nile/CSP' are concatenated implicitly).
# Table paths in the jobs below refer to it as `$job_root`.
cluster = clusters.Plato().env(
                                job_root=('home/search-research/' + username + '/'
                                         'nile/CSP') # directory on Plato where results are stored
                            )
# Dates iterated by the per-day loop below.
# NOTE(review): whether the end date is inclusive depends on DateRange -- confirm.
dates = nile.api.path.DateRange('2016-01-30', '2016-02-05')
# Per-day stage: for each date, run map_csp over 'statbox/csp-log/$date' and
# append the extracted uids to '$job_root/csp_uids'.
for date in dates:
    # NOTE(review): this unconditional `continue` skips the rest of the loop
    # body, so no per-day job is built or run. Presumably it was added to
    # disable the stage once its output already existed -- confirm before
    # removing it, since the stage appends to its output table.
    continue
    job = cluster.job().env(date=date)

#    log = job.table('userdata/user_sessions/$date')
    csp = job.table('statbox/csp-log/$date')

    # NOTE(review): this writes to '$job_root/csp_uids', while the aggregation
    # job after the loop reads '$job_root/1/csp_uids' -- verify the paths.
    result = csp.map(map_csp).put('$job_root/csp_uids', append = True)

# Earlier experiments, kept commented out for reference:
#    result.groupby('uid').aggregate(count=na.count()).put('$job_root/aggr_uids')
#    result.groupby('yuid').aggregate(count=na.count()).put('$job_root/aggr_yuids')

#    queries = log.groupby('key').sort('subkey').reduce(Reduce,files=[nile.files.LocalFile('blockstat.dict'),nile.files.LocalFile('libra.so')]).put('$job_root/result')

#    queries.groupby('ui','reg').aggregate(freq=na.count()).put('$job_root/aggr_result')

#    aggr_queries = queries.groupby('query').aggregate(freq=aggregators.count()).put('$job_root/queries')

#    filtered_queries = aggr_queries.filter(
#                        nf.custom(lambda x: x > 1000, 'freq')
#                   ).put('$job_root/queries_freq_more_1K') # filtering freq > 1000 and store filtered table


#    join1 = tr.join(filtered_queries, by='query').filter( nf.custom(bool, 'freq')).put('$job_root/join1') # join table of hosts with table of queries
    job.run()

# Second stage: a single job that counts CSP records per uid and joins the
# counts with the control / experiment uid tables.
job = cluster.job()

# Input table of extracted uids.
t = job.table('$job_root/1/csp_uids')

# Clean each uid with map_uids, then count records per uid and store the counts.
csp_uids = t.map(map_uids).groupby('uid').aggregate(count=na.count()).put('$job_root/1/aggr_csp_uids')

#t = job.table('$job_root/aggr_csp_uids2')

# Existing uid tables for the two experiment slots; they are only read here.
control = job.table('$job_root/control_uids')
exp = job.table('$job_root/exp_uids')

# Inner joins: the `uid` column of the counts is matched against the `key`
# column of each slot table; results are stored separately per slot.
join_exp = csp_uids.join(exp, by_left='uid', by_right='key', type='inner').put('$job_root/1/csp_exp')
join_control = csp_uids.join(control, by_left='uid', by_right='key', type='inner').put('$job_root/1/csp_control')


# Earlier variants, kept commented out for reference:
#join_exp = t.join(exp, by='query').filter( nf.custom(bool, 'freq')).put('$job_root/join1')
#t.map(map_uids).groupby('uid').aggregate(count=na.sum('count')).put('$job_root/aggr_csp_uids2')
#t.groupby('uid').aggregate(count=na.count()).put('$job_root/aggr_csp_uids')

# Execute the job assembled above.
job.run()


