# -*- coding: UTF-8 -*-

import cgi
import random
import re
import sys
import urllib
from collections import defaultdict
from datetime import datetime

try:
    from urllib.parse import parse_qs  # Python 3
except ImportError:
    from urlparse import parse_qs  # Python 2.6+

import libra
from mapreducelib import MapReduce, Record

def Reduce(key, recs):
    """Emit the suggest-assisted touch web requests of one user.

    Keys that do not start with 'y' are skipped.  The records are parsed
    with ``libra.ParseSession``; for every ``TTouchYandexWebRequest`` whose
    ``ServiceDomRegion`` is 'ru' and which carries one of the two test ids,
    a record is yielded when ``GetSuggest()`` is truthy.

    Args:
        key: user id (reduce key).
        recs: records of that user, passed straight to libra.

    Yields:
        Record(uid, '', '<slot>\\t<Query>\\t<ReqID>') with tableIndex 2 for
        the control test id and 3 for the experiment test id.  Nothing is
        written to tables 0 and 1.
    """
    uid = key
    # startswith() also rejects an empty key, where uid[0] would raise.
    if not uid.startswith('y'):
        return

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session that cannot be parsed is skipped.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt, SystemExit
        # and GeneratorExit propagate.
        return

    control_id = '17982'
    exp_id = '17983'

    for request in session:
        if not request.IsA('TTouchYandexWebRequest'):
            continue

        if request.ServiceDomRegion != 'ru':
            continue

        if request.HasTestID(control_id):
            table = 0
            slot = 'control ' + control_id
        elif request.HasTestID(exp_id):
            table = 1
            slot = 'exp ' + exp_id
        else:
            continue

        if request.GetSuggest():
            # Suggest rows go to the second pair of destination tables.
            yield Record(uid, '',
                         slot + '\t' + request.Query + '\t' + request.ReqID,
                         tableIndex=table + 2)

class getSess:
    """Map callable that keeps SUGGEST rows of control/experiment users.

    Args:
        c: iterable of control user ids.
        e: iterable of experiment user ids.

    Calling the instance with a record yields
    ``Record(uid, subkey, value, tableIndex=TI)`` where TI is 0 for a
    control user and 1 for an experiment user, and only when the value
    contains 'type=SUGGEST'.  Records with an empty key, or a key found in
    neither collection, produce nothing.
    """

    def __init__(self, c, e):
        # Stored as frozensets so the per-record membership test is a hash
        # lookup instead of a linear scan over a list of ids.
        self.c = frozenset(c)
        self.e = frozenset(e)

    def __call__(self, rec):
        uid = rec.key
        ts = rec.subkey
        line = rec.value

        # The id collections may contain '' (e.g. from a trailing newline in
        # the source file), so an empty key must be rejected explicitly.
        if uid == '':
            return

        if uid in self.c:
            table = 0
        elif uid in self.e:
            table = 1
        else:
            return

        # Only SUGGEST rows are forwarded: 0 - suggest control, 1 - suggest exp.
        if 'type=SUGGEST' in line:
            yield Record(uid, ts, line, tableIndex=table)

def map_sess(rec):
    """Project a session row onto the few fields needed downstream.

    The record value is a tab-separated list of ``name=value`` tokens
    (tokens without '=' are ignored; a repeated name keeps its last value).
    Depending on the ``type`` field one record is yielded with the same key
    and subkey and a tab-separated value:

        SUGGEST: type, input, status, query, suggest_reqid
        REQUEST: type, query, reqid
        ACCESS:  type, reqid, request

    A missing field is written as the string 'None'.  Rows of any other
    type yield nothing.
    """
    uid = rec.key
    subkey = rec.subkey

    fields = {}
    for token in rec.value.split('\t'):
        name, sep, val = token.partition('=')
        if sep:
            fields[name] = val

    kind = fields.get('type')
    if kind == 'SUGGEST':
        columns = ('input', 'status', 'query', 'suggest_reqid')
    elif kind == 'REQUEST':
        columns = ('query', 'reqid')
    elif kind == 'ACCESS':
        columns = ('reqid', 'request')
    else:
        return

    parts = [kind]
    for column in columns:
        parts.append(str(fields.get(column)))
    yield Record(uid, subkey, '\t'.join(parts))

def map_keys(rec):
    """Re-key a row produced by ``map_sess`` for the duplicate-key checks.

    The record value is a tab-separated row whose first field is the row
    type.  Every yielded record keeps the subkey, has the value
    ``<uid>\\t<row>`` and a key truncated to 200 characters:

        table 0: SUGGEST suggest_reqid, REQUEST reqid, ACCESS reqid
        table 1: SUGGEST suggest_reqid, ACCESS ``suggest_reqid`` taken from
                 the query string of the request field
        table 2: SUGGEST / REQUEST query as is
        table 3: SUGGEST / REQUEST query with surrounding whitespace stripped

    Rows of an unknown type, or with fewer fields than their type requires,
    yield nothing.  An ACCESS row without a query string or without a
    ``suggest_reqid`` parameter only yields its table 0 record.
    """
    uid = rec.key
    sk = rec.subkey
    line = rec.value

    fields = line.split('\t')
    rtype = fields[0]
    # Same "<uid>\t<row>" value for every table; the query tables used to
    # get uid and row glued together without the separator.
    value = uid + '\t' + line

    if rtype == 'SUGGEST':
        if len(fields) < 5:
            return  # malformed row: skip instead of failing the whole job
        suggest_reqid = fields[4][:200]
        query = fields[3]
        yield Record(suggest_reqid, sk, value, tableIndex=0)
        yield Record(suggest_reqid, sk, value, tableIndex=1)
        yield Record(query[:200], sk, value, tableIndex=2)
        yield Record(query.strip()[:200], sk, value, tableIndex=3)

    elif rtype == 'REQUEST':
        if len(fields) < 3:
            return  # malformed row
        query = fields[1]
        reqid = fields[2]
        yield Record(reqid[:200], sk, value, tableIndex=0)
        yield Record(query[:200], sk, value, tableIndex=2)
        yield Record(query.strip()[:200], sk, value, tableIndex=3)

    elif rtype == 'ACCESS':
        if len(fields) < 3:
            return  # malformed row
        reqid = fields[1]
        yield Record(reqid[:200], sk, value, tableIndex=0)

        # Everything after the first '?' is the query string; no '?' means
        # there is nothing to look up.
        _, found, qs = fields[2].partition('?')
        if not found:
            return

        params = parse_qs(qs)
        if 'suggest_reqid' not in params:
            return

        suggest_reqid = params['suggest_reqid'][0]
        yield Record(suggest_reqid[:200], sk, value, tableIndex=1)

def aggr(key, recs):
    """Yield the key once if it has more than one record.

    Args:
        key: reduce key.
        recs: iterable of the records sharing that key.

    Yields:
        Record(key, '', '') when the key occurs in at least two records;
        nothing otherwise.
    """
    # Only the number of records matters, so they are counted while being
    # consumed instead of being buffered in memory.  The iterable is still
    # read to the end.
    count = 0
    for _ in recs:
        count += 1

    if count > 1:
        yield Record(key, '', '')


def main():
    """Run the suggest comparison pipeline on the MapReduce cluster.

    Steps, in order:
      1. ``Reduce`` over the source table into four destination tables.
      2. ``map_sess`` over the source table into ``<src>_mapped``.
      3. ``map_keys`` over ``<src>_mapped`` into four per-key tables.
      4. ``aggr`` over each per-key table into ``<src>/aggr/<name>``.

    Reads ``c.txt`` and ``e.txt`` from the current directory.
    """
    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='research',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
        # testMode=True,
    )

    # Id lists, one entry per line.  They are loaded but only the disabled
    # getSess step below would consume them.
    with open('c.txt') as control_file:
        c = control_file.read().replace('\r', '').split('\n')
    with open('e.txt') as exp_file:
        e = exp_file.read().replace('\r', '').split('\n')

    for day in ['1105']:
        src = 'user_sessions/2015' + day
        # The daily table name is immediately replaced by a fixed example table.
        src = 'ensuetina/TOUCH_SUGGEST_EXP/compare_suggests/example/1'

        suggests_c = 'ensuetina/TOUCH_SUGGEST_EXP/compare_suggests/example/suggests_libra_c'
        suggests_e = 'ensuetina/TOUCH_SUGGEST_EXP/compare_suggests/example/suggests__libra_e'
        requests_c = 'ensuetina/TOUCH_SUGGEST_EXP/validate/NEW/2/requests_c_!'
        requests_e = 'ensuetina/TOUCH_SUGGEST_EXP/validate/NEW/2/requests_e_!'

        # Destination order defines the tableIndex values used by Reduce.
        MapReduce.runReduce(
            Reduce,
            srcTable=src,
            dstTables=[requests_c, requests_e, suggests_c, suggests_e],
            files=['/home/ensuetina/data/blockstat.dict'],
            sortMode=True,
        )

        # Disabled step:
        # MapReduce.runMap(getSess(c, e),
        #                  srcTable=src,
        #                  dstTables=[suggests_c, suggests_e, requests_c, requests_e],
        #                  files=['/home/ensuetina/data/blockstat.dict'],
        #                  # appendMode=True,
        #                  sortMode=True)

    # src keeps the value assigned in the loop above.
    mapped = src + '_mapped'
    MapReduce.runMap(
        map_sess,
        srcTable=src,
        dstTable=mapped,
        sortMode=True,
    )

    # (per-key table suffix, aggregated table suffix); the order of the
    # first column is the tableIndex order used by map_keys.
    aggr_steps = [
        ('_reqids', '/aggr/reqids'),
        ('_suggest_reqids', '/aggr/suggest_reqids'),
        ('_queries', '/aggr/queries'),
        ('_striped_queries', '/aggr/striped_queries'),
    ]

    MapReduce.runMap(
        map_keys,
        srcTable=mapped,
        dstTables=[src + in_suffix for in_suffix, _ in aggr_steps],
        sortMode=True,
    )

    for in_suffix, out_suffix in aggr_steps:
        MapReduce.runReduce(
            aggr,
            srcTable=src + in_suffix,
            dstTable=src + out_suffix,
            sortMode=True,
        )

    # Disabled step:
    # MapReduce.runMap(map_sess,
    #                  srcTable=suggests_e,
    #                  dstTable=suggests_e + '_mapped',
    #                  sortMode=True)

    # Table name prefixes that no step in this function references.
    d0 = 'ensuetina/TOUCH_SUGGEST_EXP/validate/reqids_'
    d1 = 'ensuetina/TOUCH_SUGGEST_EXP/validate/suggest_reqids_'
    d2 = 'ensuetina/TOUCH_SUGGEST_EXP/validate/queries_'
    d3 = 'ensuetina/TOUCH_SUGGEST_EXP/validate/callbacks_'


#    d_0 = 'ensuetina/TOUCH_SUGGEST_EXP/validate/aggr/1/reqids_'
#    d_1 = 'ensuetina/TOUCH_SUGGEST_EXP/validate/aggr/1/suggest_reqids_'
#    d_2 = 'ensuetina/TOUCH_SUGGEST_EXP/validate/aggr/1/queries_'
#    d_3 = 'ensuetina/TOUCH_SUGGEST_EXP/validate/aggr/1/callbacks_'


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
