# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from datetime import datetime
import libra
import urllib, re, random, sys

def Reduce(key, recs):
    """Emit one row per Russian touch web request that carries an A/B test id.

    Keys that do not start with 'y' are ignored.  The records of a key are
    parsed with ``libra.ParseSession`` (using ``./blockstat.dict``); if
    parsing fails the whole key is skipped.

    For each ``TTouchYandexWebRequest`` with ``ServiceDomRegion == 'ru'``:
      * test id 17982 -> slot label 'control 17982', written to tableIndex 0
      * test id 17983 -> slot label 'exp 17983', written to tableIndex 1
      * neither       -> request is skipped

    The emitted value is tab-separated:
    slot, date (YYYY-MM-DD), query, request id, suggest flag (1/0).
    """
    uid = key
    # startswith() also copes with an empty key, where uid[0] would raise
    # IndexError and abort the reducer.
    if not uid.startswith('y'):
        return

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session that cannot be parsed is dropped.  Catching
        # Exception (not a bare except) lets SystemExit / KeyboardInterrupt
        # propagate.
        return

    control_id = '17982'
    exp_id = '17983'

    for r in session:
        if not r.IsA('TTouchYandexWebRequest'):
            continue

        if r.ServiceDomRegion != 'ru':
            continue

        # Date part of the request timestamp, in the local timezone of the
        # process (datetime.fromtimestamp without tz).
        ts = datetime.fromtimestamp(r.Timestamp).date().isoformat()

        if r.HasTestID(control_id):
            table_index = 0
            slot = 'control ' + control_id
        elif r.HasTestID(exp_id):
            table_index = 1
            slot = 'exp ' + exp_id
        else:
            continue

        is_sugg = 1 if r.GetSuggest() else 0

        reqid = r.ReqID
        q = str(r.Query)

        value = '\t'.join([slot, ts, q, reqid, str(is_sugg)])
        yield Record(uid, '', value, tableIndex=table_index)


def aggr(key, recs):
    """Collapse a whole group into one record that carries only its key.

    The incoming records are not read; subkey and value are empty strings.
    """
    blank = ''
    yield Record(key, blank, blank)

def parse_redir(rec):
    """Turn one '@@'-separated redir-log line into a per-user suggest record.

    The line is read from ``rec.value``.  The last '@@' field is taken as
    the user id (prefixed with 'y'), the third field from the end as a Unix
    timestamp.  ``name=value`` fields are collected into a dict.

    Nothing is emitted when:
      * the resulting uid is shorter than 10 characters,
      * the timestamp field is missing or cannot be converted to a date,
      * ``path`` starts with neither 'morda_ru' nor 'serp_ru'.

    Otherwise one record keyed by uid is yielded with a tab-separated value:
    date, path, service, suggest_reqid, tpah_log, ctr flag, ratio,
    since_first_change, 'input=<user_input>', 'query=<prev_query>'.
    Missing fields appear as the string 'None'.
    """
    # Split once and reuse; the original split the line three times.
    fields = rec.value.split('@@')

    uid = 'y' + fields[-1]
    if len(uid) < 10:
        return
    try:
        date = datetime.fromtimestamp(float(fields[-3])).date().isoformat()
    except Exception:
        # Too few fields, a non-numeric or out-of-range timestamp: skip the
        # line.  Exception (not a bare except) keeps SystemExit and
        # KeyboardInterrupt working.
        return

    data = dict(part.split('=', 1) for part in fields if '=' in part)

    path = str(data.get('path'))
    if path.startswith('morda_ru'):
        service = 'morda'
    elif path.startswith('serp_ru'):
        service = 'serp'
    else:
        return

    suggest_reqid = str(data.get('suggest_reqid'))
    # NOTE(review): the key is spelled 'tpah_log' exactly as before; confirm
    # that this matches the field name in the log.
    tpath_log = str(data.get('tpah_log'))
    ratio = str(data.get('ratio'))
    t = str(data.get('since_first_change'))
    inp = str(data.get('user_input'))
    q = str(data.get('prev_query'))

    if 'not_used' not in path and 'word,p' in tpath_log:
        is_ctr = 'ctr true'
    else:
        is_ctr = 'ctr false'

    value = '\t'.join([
        date, path, service, suggest_reqid, tpath_log, is_ctr, ratio, t,
        'input=' + inp, 'query=' + q,
    ])
    yield Record(uid, '', value)

def parse_date(rec):
    """Re-key a record by 'uid<TAB>date', looking for the date in the value.

    The value is split on tabs.  If the first field contains '2015-', the
    record goes to tableIndex 0 keyed by ``uid + '\\t' + first_field``.
    Otherwise the second field is checked the same way.  A value with no
    second field is written unchanged to tableIndex 1 (the error table in
    the calls visible in this file).  Records with no '2015-' in either
    field are dropped.
    """
    uid = rec.key
    line = rec.value
    fields = line.split('\t')

    first = fields[0]
    if '2015-' in first:
        yield Record(uid + '\t' + first, '', line, tableIndex=0)
        return

    # Explicit length check instead of a bare except around fields[1]:
    # a missing second field is the only failure being handled here.
    if len(fields) < 2:
        yield Record(uid, '', line, tableIndex=1)
        return

    second = fields[1]
    if '2015-' in second:
        yield Record(uid + '\t' + second, '', line, tableIndex=0)


def join(key, recs):
    """Pass a group's records through only if the group has a marker record.

    A record whose value is exactly 10 characters long acts as the marker
    and is never emitted itself.  Of the remaining records, those whose
    value has exactly three tab-separated fields are discarded; the rest
    are buffered and emitted (keyed by ``key``) once the whole group has
    been read and a marker was seen.
    """
    kept = []
    marker_seen = False
    for item in recs:
        value = item.value
        if len(value) == 10:
            marker_seen = True
        elif value.count('\t') != 2:
            # count('\t') == 2 is the same as "splits into three fields".
            kept.append(item)

    if not marker_seen:
        return

    for item in kept:
        yield Record(key, '', item.value)

def join1(key, recs):
    """Pass a group's records through only if it has an empty-valued record.

    A record with an empty value acts as the marker and is not emitted.
    All other records are buffered and emitted (keyed by ``key``) after the
    group has been read, provided a marker was seen.
    """
    kept = []
    marker_seen = False
    for item in recs:
        if len(item.value) == 0:
            marker_seen = True
        else:
            kept.append(item)

    if not marker_seen:
        return

    for item in kept:
        yield Record(key, '', item.value)

def count_suggest(rec):
    """Keep only records whose third tab-separated field equals '1'.

    Raises IndexError if the value has fewer than three fields.
    """
    value = rec.value
    suggest_flag = value.split('\t')[2]
    if str(suggest_flag) != '1':
        return
    yield Record(rec.key, '', value)

def get_100_uids(key, recs):
    """Randomly sample keys, emitting 'key<TAB>date' for the chosen ones.

    Each key is kept with probability 1/6458 (presumably tuned so that
    roughly 100 uids survive -- confirm against the input size).  For a
    kept key, the first tab-separated field of the first record's value is
    used as the date; the emitted record has key ``key + '\\t' + date`` and
    value ``date``.  A kept key with no records emits nothing.
    """
    if random.randint(1, 6458) != 1:
        return

    date = None
    for rec in recs:
        date = rec.value.split('\t')[0]
        break  # only the first record is needed

    # Guard against an empty group: previously `date` was unbound here and
    # the yield raised UnboundLocalError.
    if date is None:
        return

    yield Record(key + '\t' + date, '', date)

def cut_extra(rec):
    """Drop records whose value has exactly three tab-separated fields.

    Every other record is re-emitted unchanged (same key, empty subkey,
    same value).
    """
    uid, line = rec.key, rec.value
    # Two tabs means exactly three fields.
    if line.count('\t') != 2:
        yield Record(uid, '', line)


def main():
    """Drive the SUGGEST_CTR MapReduce jobs; steps are toggled by editing this function.

    As written, the per-day loop only assigns table names: the first
    unconditional ``continue`` ends every iteration before ``runMap`` and
    ``runReduce`` are reached.  After the loop two reduce jobs run:
    ``get_100_uids`` over ``redir_exp`` and ``join`` over three tables into
    ``ensuetina/SUGGEST_CTR/100_sessions``.  All other steps are commented
    out.
    """

    # Connection / execution defaults shared by every job started below.
    MapReduce.useDefaults(
                            server   = 'sakura.search.yandex.net:8013',
                            username = 'userstats',
                            mrExec   = '/Berkanavt/bin/mapreduce-dev',
                            verbose  = True,
                            #testMode = True,
                         )

    # Day-of-month suffixes appended to '201512' to build the source table names.
    dd = ['17','18','19','20','21','22','23','24','25','26','27','28','29','30']
    for d in dd:
        src = 'user_sessions/201512' + d
        redir = 'redir_log/201512' + d

        # Per-request output tables of Reduce (tableIndex 0 / 1).
        d0 = 'ensuetina/SUGGEST_CTR/us_reqs_control'
        d1 = 'ensuetina/SUGGEST_CTR/us_reqs_exp'

        # Destination tables of the (commented-out) aggr jobs further down.
        dt0 = 'ensuetina/SUGGEST_CTR/us_uids_control'
        dt1 = 'ensuetina/SUGGEST_CTR/us_uids_exp'

        #continue

        redir_suggests = 'ensuetina/SUGGEST_CTR/1/redir_suggests'

        # This continue disables the rest of the loop body: neither job
        # below is executed while it is present.
        continue

        MapReduce.runMap(parse_redir,
                         srcTable = redir,
                         dstTable = redir_suggests,
                         appendMode = True,
                         sortMode = True
                        )
        # Second toggle: with the continue above removed, this one would
        # still skip the Reduce job.
        continue

        MapReduce.runReduce(Reduce,
                            srcTable = src,
                            dstTables = [d0,d1],
                            files = ['/home/ensuetina/data/blockstat.dict'],
                            appendMode = True,
                            sortMode = True
                            )

    # NOTE: d0, d1, dt0, dt1 and redir_suggests were assigned inside the loop
    # and are used below with the values from its last iteration.

#    MapReduce.runMap(count_suggest,
#                     srcTable = d0,
#                     dstTable = d0 + '_hasSuggest',
#                     sortMode = True
#                    )
#    MapReduce.runMap(count_suggest,
#                     srcTable = d1,
#                     dstTable = d1 + '_hasSuggest',
#                     sortMode = True
#                    )
#    sys.exit(1)


#    MapReduce.runReduce(aggr,
#                        srcTable = d0,
#                        dstTable = dt0,
#                        sortMode = True
#                       )
#    MapReduce.runReduce(aggr,
#                        srcTable = d1,
#                        dstTable = dt1,
#                        sortMode = True
#                       )


    redir_control = 'ensuetina/SUGGEST_CTR/1/redir_control'
    redir_exp = 'ensuetina/SUGGEST_CTR/1/redir_exp'

#    MapReduce.runReduce(join,
#                        srcTables = [redir_suggests,dt0],
#                        dstTable = redir_control,
#                        sortMode = True
#                       )
#    MapReduce.runReduce(join1,
#                        srcTables = [redir_suggests,dt1],
#                        dstTable = redir_exp,
#                        sortMode = True
#                       )

    # Active step: randomly sample keys from the experiment redir table.
    MapReduce.runReduce(get_100_uids,
                        srcTable = redir_exp,
                        dstTable = 'ensuetina/SUGGEST_CTR/100_redir_uids',
                        sortMode = True
                       )

    # Error table used by the (commented-out) parse_date jobs below.
    err = 'ensuetina/SUGGEST_CTR/get_date_errors'

#    MapReduce.runMap(parse_date,
#                     srcTable = d1,
#                     dstTables = [d1 + '_date_parsed',err],
                     #appendMode = True,
#                     sortMode = True
#                    )
#    MapReduce.runMap(parse_date,
#                     srcTable = redir_exp,
#                     dstTables = [redir_exp + '_date_parsed',err],
                     #appendMode = True,
#                     sortMode = True
#                    )
#    MapReduce.runMap(parse_date,
#                     srcTable = 'ensuetina/SUGGEST_CTR/100_redir_uids',
#                     dstTables = ['ensuetina/SUGGEST_CTR/100_redir_uids_date_parsed',err],
                     #appendMode = True,
#                     sortMode = True
#                    )
    # Active step: join the two '_date_parsed' tables with the sampled uids.
    # NOTE(review): the '_date_parsed' inputs are produced only by the
    # commented-out parse_date jobs above, so they must already exist.
    MapReduce.runReduce(join,
                        srcTables = [d1 + '_date_parsed', redir_exp + '_date_parsed','ensuetina/SUGGEST_CTR/100_redir_uids'],
                        dstTable = 'ensuetina/SUGGEST_CTR/100_sessions',
                        sortMode = True
                       )

#    MapReduce.runMap(cut_extra,
#                     srcTable = 'ensuetina/SUGGEST_CTR/100_sessions',
#                     dstTable = 'ensuetina/SUGGEST_CTR/100_sessions_clean',
#                     sortMode = True
#                    )

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
