# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from datetime import datetime, timedelta, date
import libra
import urllib, re,random
import csv
import StringIO
from urlparse import urlparse, parse_qs

# Positional field names of an access_log line; map_uids zips this list with
# the space-separated, double-quote-aware parsed row to look fields up by name.
access_names = 'remote_ip blank1 blank2 time tz url http_status respond_size referer user_agent virtual_host x_forwarded_for cookies cts respond_time_sec respond_time_msec cluster_no reqid apache_pid balancer_ip yuid fuid compress_ratio internal ruip x-yandex-suspected-robot x-yandex-internal-request passport_uid scheme test_ids headers'.split()

def Reduce(key, recs):
    """Emit one marker record for a key seen in the control or experiment slot.

    Only keys starting with 'y' are processed. The records are parsed into a
    session by libra.ParseSession; the first request that is a
    'TTouchYandexWebRequest', has ServiceDomRegion 'ru' and carries one of the
    two test ids decides the output table.

    Args:
        key: the reduce key (used as the uid of the emitted record).
        recs: records for that key, handed to libra.ParseSession as-is.

    Yields:
        At most one Record(key, '', ''): tableIndex 0 when the request has
        test id '19546' (control), tableIndex 1 for '19547' (experiment).
    """
    uid = key
    # startswith() also rejects an empty key, on which uid[0] would raise.
    if not uid.startswith('y'):
        return

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: skip sessions that cannot be parsed, but do not
        # swallow KeyboardInterrupt / SystemExit like a bare except would.
        return

    control_id = '19546'
    exp_id = '19547'

    for request in session:
        if not request.IsA('TTouchYandexWebRequest'):
            continue

        if request.ServiceDomRegion != 'ru':
            continue

        if request.HasTestID(control_id):
            table = 0
        elif request.HasTestID(exp_id):
            table = 1
        else:
            continue

        # One record per key is emitted, so stop at the first match.
        yield Record(uid, '', '', tableIndex=table)
        return

def aggr(key, recs):
    """Collapse everything grouped under key into one Record(key, '', '').

    recs is never read; only the key is carried into the output.
    """
    collapsed = Record(key, '', '')
    yield collapsed

class get_errors:
    """Mapper that passes through records whose key is a known log name.

    The control/experiment values given at construction are stored on the
    instance but are not consulted by __call__: every record of a known log
    is emitted unchanged.
    """

    def __init__(self, c, e):
        # c / e: control and experiment values; stored only.
        self.exp = e
        self.control = c

    def __call__(self, rec):
        """Yield Record(key, subkey, value) if rec.key is a known log name.

        Records with any other key produce no output.
        """
        key = rec.key
        sk = rec.subkey
        line = rec.value

        logs = ('access_log', 'redir_log', 'blockstat_log', 'reqans_log')

        if key not in logs:
            return

        # NOTE(review): the filtering by self.control / self.exp that used to
        # follow this statement sat behind an unconditional return and could
        # never run, so it was dropped; output is unchanged.
        yield Record(key, sk, line)

class map_uids:
    """Mapper that routes log lines to control/experiment tables by uid.

    A uid is extracted from each line in a log-specific way and looked up in
    the control and experiment containers given at construction
    (presumably collections of uid strings -- confirm against the caller).
    """

    def __init__(self, c, e):
        # c / e: containers tested with `uid in ...` for control / experiment.
        self.exp = e
        self.control = c

    def _extract_uid(self, key, line):
        """Return the uid found in line for log `key`, or None if it is absent.

        key must be one of the four known log names.
        """
        if key == 'access_log':
            buf = StringIO.StringIO(line)
            reader = csv.reader(buf, delimiter=' ', quotechar='"')
            try:
                row = next(reader)
            except (csv.Error, StopIteration):
                # Malformed line, or an empty line that yields no row at all.
                return None
            # zip() truncates on short rows, so 'yuid' may be missing.
            return dict(zip(access_names, row)).get('yuid')

        if key == 'redir_log':
            # Last '@@'-separated field; split()[-1] cannot fail.
            return line.split('@@')[-1]

        try:
            if key == 'blockstat_log':
                # 12th tab-separated field.
                return line.split('\t')[11]
            # reqans_log: second '='-separated piece of the third '@@' field.
            return line.split('@@')[2].split('=')[1]
        except IndexError:
            # Line has too few fields to carry a uid.
            return None

    def __call__(self, rec):
        """Emit the line under its uid plus a marker record for the log name.

        Yields:
            For an unknown log name: Record(key, '', '') to table 2 only.
            For a known log whose non-empty uid is in control (table 0) or
            experiment (table 1): Record(uid, subkey, key + tab + line) to
            that table, followed by Record(key, '', '') to table 2.
            Nothing otherwise.
        """
        key = rec.key
        sk = rec.subkey
        line = rec.value

        logs = ('access_log', 'redir_log', 'blockstat_log', 'reqans_log')

        if key not in logs:
            yield Record(key, '', '', tableIndex=2)
            return

        uid = self._extract_uid(key, line)
        if not uid:
            # No uid could be extracted, or it is empty.
            return

        # Control takes precedence when a uid is present in both containers.
        if uid in self.control:
            table = 0
        elif uid in self.exp:
            table = 1
        else:
            return

        yield Record(uid, sk, key + '\t' + line, tableIndex=table)
        yield Record(key, '', '', tableIndex=2)

def map_errors(rec):
    """Emit counting keys built from the leading field of rec.value.

    The first tab-separated field of rec.value is taken as the log name
    (map_uids writes the log name there) and rec.subkey as the error text.

    Yields:
        Record(log, '', '') to table 0 and Record(log + tab + error, '', '')
        to table 1; both keys are cut to 200 characters.
    """
    error, payload = rec.subkey, rec.value
    log_name = payload.partition('\t')[0]
    log_with_error = '\t'.join((log_name, error))

    yield Record(log_name[:200], '', '', tableIndex=0)
    yield Record(log_with_error[:200], '', '', tableIndex=1)

def aggr_count(key, recs):
    """Count the records grouped under key.

    Yields:
        A single Record(key, '', n) where n is the record count as a string.
    """
    total = sum(1 for _ in recs)
    yield Record(key, '', str(total))





def main():
    """Run the currently enabled jobs over the ensuetina/NODE_JS_EXP tables.

    Configures MapReduce defaults, prints the dates 20151216..20151222, then
    runs map_errors over the control and experiment error tables and counts
    the resulting keys with aggr_count. Other pipeline stages are kept below
    as commented-out code.
    """

    MapReduce.useDefaults(
                            server   = 'sakura.search.yandex.net:8013',
                            username = 'userstats',
                            mrExec   = '/Berkanavt/bin/mapreduce-dev',
                            verbose  = True,
                            #testMode = True,
                         )

    # Walk the dates from 2015-12-16 up to (not including) 2015-12-23,
    # formatted as YYYYMMDD strings.
    cur_d1 = datetime.strptime('20151216', '%Y%m%d')
    cur_d1 = cur_d1.date()

    cur_d = str(cur_d1).replace('-','')
    while int(cur_d)<int(20151223):

        print cur_d
        # src / dt0 / dt1 are only consumed by the disabled runReduce below,
        # so with that stage commented out the loop just prints the dates.
        src = 'user_sessions/' + cur_d

        dt0 = 'ensuetina/NODE_JS_EXP/c'
        dt1 = 'ensuetina/NODE_JS_EXP/e'

#        MapReduce.runReduce(Reduce,
#                            srcTable = src,
#                            dstTables = [dt0,dt1],
#                            files = ['/home/ensuetina/data/blockstat.dict'],
#                            appendMode = True,
#                            sortMode = True
#                            )

        cur_d1 = cur_d1 + timedelta(days=1)

        cur_d = str(cur_d1).replace('-','')

#    MapReduce.runReduce(aggr,
#                        srcTable = dt0,
#                        dstTable = dt0 + '_aggr',
#                        sortMode = True
#                        )

#    MapReduce.runReduce(aggr,
#                        srcTable = dt1,
#                        dstTable = dt1 + '_aggr',
#                        sortMode = True
#                        )

#    with open('c.txt') as f:
#        c = f.read().replace('\t','').replace('\r','').replace('y','').split('\n')

#    with open('e.txt') as f:
#        e = f.read().replace('\t','').replace('\r','').replace('y','').split('\n')

#    print len(c)
#    print len(e)

#    print c[0]
#    print e[0]

    # Tables the disabled map_uids stage below writes to: d0 = control lines,
    # d1 = experiment lines, d2 = log-name markers.
    d0 = 'ensuetina/NODE_JS_EXP/errors_control'
    d1 = 'ensuetina/NODE_JS_EXP/errors_exp'
    d2 = 'ensuetina/NODE_JS_EXP/errors_logs'

    # Prefix for the output tables of the enabled jobs below.
    d = 'ensuetina/NODE_JS_EXP/1/'

#    MapReduce.runMap(map_uids(c,e),
#                     srcTable = 'ensuetina/NODE_JS_EXP/errors',
#                     dstTables = [d0,d1,d2],
#                     sortMode = True
#                    )
#    MapReduce.runReduce(aggr,
#                        srcTable = d2,
#                        dstTable = d2 + '_aggr',
#                        sortMode = True
#                       )

    # Control: map_errors splits d0 into per-log keys and per-log+error keys;
    # aggr_count then counts the records for each key in both tables.
    MapReduce.runMap(map_errors,
                     srcTable = d0,
                     dstTables = [d+'logs_control',d+'logs_errors_control'],
                     sortMode = True
                    )
    MapReduce.runReduce(aggr_count,
                        srcTable = d+'logs_control',
                        dstTable = d+'aggr_logs_control',
                        sortMode = True
                       )
    MapReduce.runReduce(aggr_count,
                        srcTable = d+'logs_errors_control',
                        dstTable = d+'aggr_logs_errors_control',
                        sortMode = True
                       )
    # Experiment: the same three jobs over d1.
    MapReduce.runMap(map_errors,
                     srcTable = d1,
                     dstTables = [d+'logs_exp',d+'logs_errors_exp'],
                     sortMode = True
                    )
    MapReduce.runReduce(aggr_count,
                        srcTable = d+'logs_exp',
                        dstTable = d+'aggr_logs_exp',
                        sortMode = True
                       )
    MapReduce.runReduce(aggr_count,
                        srcTable = d+'logs_errors_exp',
                        dstTable = d+'aggr_logs_errors_exp',
                        sortMode = True
                       )

#    dd = ['16','17','18','19','20','21','22']
#    for d in dd:

#        src = 'user_sessions/201512' + d + '/raw/errors'

#        MapReduce.runMap(get_errors('c','e'),
#                         srcTable = src,
#                         dstTable = 'ensuetina/NODE_JS_EXP/errors',
#                         dstTables = [dt0 + '_errors', dt1 + '_errors'],
#                         appendMode = True,
#                         sortMode = True
#                       )


# Run the jobs only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
