# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from datetime import datetime
import libra
import urllib, re, random, urlparse, cgi

# Lazily built cache: code point -> u' ' for every character that
# normalize_query() should turn into a space.  Filled by build_translation().
TRANSLATION = None


def build_translation():
    """Return the translation table used to blank out punctuation.

    The table maps the ordinal of every character whose Unicode general
    category starts with 'P' (punctuation), plus a fixed set of ASCII
    control/symbol characters, to a single space.  It is suitable for
    ``unicode.translate`` (``str.translate`` on Python 3).

    The table is built on first use and cached in the module-level
    ``TRANSLATION`` variable; later calls return the same dict object.
    """
    global TRANSLATION
    if TRANSLATION is None:
        import sys
        import unicodedata

        # Python 2 spells these xrange/unichr; Python 3 only has range/chr.
        # The upper bound is inclusive so sys.maxunicode itself is scanned.
        try:
            code_points = xrange(sys.maxunicode + 1)
            to_char = unichr
        except NameError:
            code_points = range(sys.maxunicode + 1)
            to_char = chr

        table = {}
        for index in code_points:
            if unicodedata.category(to_char(index)).startswith('P'):
                table[index] = u' '
        # These are not in a 'P' category but are blanked out as well.
        for char in u'\t\n\x0b\x0c\r$+<=>^`|~':
            table[ord(char)] = u' '

        # Publish only the finished table so a failure mid-build leaves
        # the cache empty instead of half-populated.
        TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Return a normalized, UTF-8 encoded form of a UTF-8 encoded query.

    The query is decoded, every character listed in the table from
    build_translation() is replaced by a space, the text is lower-cased and
    stripped, and each run of two or more whitespace characters is collapsed
    into a single space.

    Returns None when the input is not valid UTF-8.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        return

    text = text.translate(build_translation()).lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', text)
    return collapsed.encode('utf8')


def Reduce(key, recs):
    """Emit request and click rows for pad requests that carry a test id.

    key  -- the uid.  Only uids starting with 'y' whose last 10 characters
            parse as an integer timestamp are processed; every other key
            produces no output.
    recs -- the records for that uid, handed to libra.ParseSession.

    For every 'TPadYandexWebRequest' in the session whose FullRequest has a
    query string and which has test id 17655 (control) or 17654 (experiment),
    one row is yielded with tableIndex 0 (control) or 1 (experiment).  For
    every click on a main block whose main result is a 'TWebResult', the same
    row with the clicked host appended is yielded with tableIndex + 2.

    Row value (tab separated): cookie label, slot, request date, uid date,
    browser name and version from GetBrowser(), BrowserName / BrowserVersion /
    OSName from uatraits, the 'reload' query parameter (or '-'), callback
    flag (1 if 'callback' occurs in the request URL, else 0).

    Nothing is yielded when the session cannot be parsed.
    """
    uid = key
    # startswith() also rejects an empty key instead of raising IndexError.
    if not uid.startswith('y'):
        return

    # The trailing 10 characters are read as a Unix timestamp.  A uid that
    # does not end in a number is skipped rather than crashing the reducer.
    try:
        uid_created = int(uid[-10:])
    except ValueError:
        return

    # Cutoff is 2015-10-09 21:00:00 UTC.
    if uid_created > 1444424400:
        cookie = 'NEW COOKIE'
    else:
        cookie = 'OLD COOKIE'

    # fromtimestamp() converts to the worker's local time zone.
    uid_ts = datetime.fromtimestamp(uid_created).date().isoformat()

    pad_c = '17655'
    pad_e = '17654'

    # Best effort: a session that fails to parse yields no rows.
    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        return

    # Imported here rather than at module level; built only once the key and
    # the session are known to be usable.
    import uatraits
    detector = uatraits.detector('browser.xml')

    for r in session:
        if not r.IsA('TPadYandexWebRequest'):
            continue

        serp_url = r.FullRequest
        url_parts = serp_url.split('?', 1)
        if len(url_parts) < 2:
            continue

        # Cheap filters run first so user-agent detection is only done for
        # requests that will actually produce a row.
        if r.HasTestID(pad_c):
            table_index = 0
            slot = 'control ' + pad_c
        elif r.HasTestID(pad_e):
            table_index = 1
            slot = 'exp ' + pad_e
        else:
            continue

        # parse_qs returns a list per parameter, so a present 'reload' is
        # rendered below as the list's repr (output format kept as is).
        params = urlparse.parse_qs(url_parts[1])
        rld = params.get('reload', '-')

        is_callback = 1 if 'callback' in serp_url else 0

        ts = datetime.fromtimestamp(r.Timestamp).date().isoformat()
        browser = r.GetBrowser()
        traits = detector.detect(r.UserAgent)

        row = '\t'.join([
            cookie,
            slot,
            ts,
            uid_ts,
            str(browser[0]),
            str(browser[1]),
            str(traits.get('BrowserName')),
            str(traits.get('BrowserVersion')),
            str(traits.get('OSName')),
            str(rld),
            str(is_callback),
        ])

        # One row per qualifying request.
        yield Record(uid, '', row, tableIndex=table_index)

        # One row per click on a web result, with the clicked host appended.
        for bl in r.GetMainBlocks():
            if not bl.GetMainResult().IsA('TWebResult'):
                continue
            for cl in bl.GetClicks():
                host = urlparse.urlparse(str(cl.Url)).netloc
                yield Record(uid, '', row + '\t' + host,
                             tableIndex=table_index + 2)

class map_redir:
    """Mapper that emits one record per 'jserror' row of the redir log.

    A row is a string of '@@'-separated fields; fields containing '=' are
    read as name=value pairs and the last field is taken as the uid.
    """

    def __init__(self, exp, control):
        """Remember the experiment and control uid collections.

        __call__ does not consult them at present, but the arguments are
        stored as given instead of being discarded.
        """
        self.exp = exp
        self.control = control

    def __call__(self, rec):
        """Yield Record('y' + uid, '', 'jserror\\t<url>') for a jserror row.

        Nothing is yielded when the uid is shorter than 10 characters, when
        the row has no 'dtype' field, or when 'dtype' is not 'jserror'.  The
        url is URL-unquoted; '-' is used when the row has no 'url' field.
        """
        line = rec.value
        fields = line.split('@@')

        uid = fields[-1]
        if len(uid) < 10:
            return
        uid = 'y' + uid

        # Split on the first '=' only so values may themselves contain '='.
        data = dict(f.split('=', 1) for f in fields if '=' in f)

        dt = data.get('dtype')
        if dt != 'jserror':
            return

        if 'url' in data:
            url = urllib.unquote(data['url'])
        else:
            url = '-'

        yield Record(uid, '', dt + '\t' + url)


def transform(rec):
    """Pass a record through only if its fifth field is 'AndroidBrowser'.

    rec.value is split on tabs; the field at index 4 is compared with
    'AndroidBrowser'.  Matching records are re-emitted unchanged as
    Record(rec.key, '', rec.value); all others produce no output.

    Rows with fewer than five fields are skipped instead of raising
    IndexError.
    """
    uid = rec.key
    line = rec.value

    fields = line.split('\t')
    if len(fields) < 5 or fields[4] != 'AndroidBrowser':
        return

    yield Record(uid, '', line)

def aggr(key,recs):
    """Yield a single marker record with value 'us' for the key.

    The incoming records are not read; exactly one record is produced per
    call regardless of how many were supplied.
    """
    marker = Record(key, '', 'us')
    yield marker

def join(key,recs):
    """Re-emit a key's records only if a 'us' marker record is among them.

    Records whose value is 'us' act as the marker and are not emitted.
    All other records are buffered, because the marker may appear at any
    position, and are yielded as Record(key, '', value) once the input is
    exhausted and the marker was seen.  Without a marker nothing is yielded.
    """
    marker_seen = False
    pending = []
    for item in recs:
        if item.value == 'us':
            marker_seen = True
        else:
            pending.append(item)

    if not marker_seen:
        return

    for item in pending:
        yield Record(key, '', item.value)


def main(run_extraction=False):
    """Run the pad HTTPS-experiment pipeline.

    run_extraction -- when true, first run the per-day extraction jobs for
        2015-10-23 .. 2015-10-27: map redir_log/<day> through map_redir into
        the errors table, and reduce user_sessions/<day> through Reduce into
        the four control/exp request and click tables.  Those jobs use
        appendMode=True.  The default is False, which skips them exactly as
        the former unconditional ``continue`` in the day loop did.

    The post-processing stage always runs:
      1. transform: keep AndroidBrowser rows of the control and exp tables.
      2. aggr: reduce each filtered table to one 'us' marker per uid.
      3. join: join each uid table with the errors table.
    """
    MapReduce.useDefaults(
                            server   = 'sakura.search.yandex.net:8013',
                            username = 'userstats',
                            mrExec   = '/Berkanavt/bin/mapreduce-dev',
                            verbose  = True,
                            #testMode = True,
                         )

    # Table names do not depend on the day, so they are defined once here
    # rather than inside the day loop that later code relied on having run.
    dt0 = 'ensuetina/HTTPS_EXPERIMENTS/PADS/new/1/pads_control'
    dt1 = 'ensuetina/HTTPS_EXPERIMENTS/PADS/new/1/pads_exp'

    dt2 = 'ensuetina/HTTPS_EXPERIMENTS/PADS/new/1/pads_control_clicks'
    dt3 = 'ensuetina/HTTPS_EXPERIMENTS/PADS/new/1/pads_exp_clicks'

    dst = 'ensuetina/HTTPS_EXPERIMENTS/PADS/new/1/errors'

    if run_extraction:
        for d in ['23', '24', '25', '26', '27']:
            us = 'user_sessions/201510' + d
            redir = 'redir_log/201510' + d

            MapReduce.runMap(map_redir('',''),
                             srcTable = redir,
                             dstTable = dst,
                             appendMode = True,
                             sortMode = True
                            )

            MapReduce.runReduce(Reduce,
                                srcTable = us,
                                dstTables = [dt0,dt1,dt2,dt3],
                                files = ['/home/ensuetina/data/blockstat.dict',
                                         '/home/ensuetina/lib/UATRAITS/uatraits.so',
                                         '/home/ensuetina/lib/UATRAITS/libboost_python-py27.so.1.46.1',
                                         '/home/ensuetina/lib/UATRAITS/libuatraits.so.0',
                                         '/home/ensuetina/lib/UATRAITS/browser.xml'],
                                appendMode = True,
                                sortMode = True
                                )

    # Keep only AndroidBrowser requests.
    MapReduce.runMap(transform,
                        srcTable = dt0,
                        dstTable = dt0 + '_android',
                        sortMode = True
                       )
    MapReduce.runMap(transform,
                        srcTable = dt1,
                        dstTable = dt1 + '_android',
                        sortMode = True
                       )

    # One marker record per uid.
    MapReduce.runReduce(aggr,
                        srcTable = dt0 + '_android',
                        dstTable = dt0 + '_uids',
                        sortMode = True
                       )
    MapReduce.runReduce(aggr,
                        srcTable = dt1 + '_android',
                        dstTable = dt1 + '_uids',
                        sortMode = True
                       )

    # Errors of the uids that have a marker.
    MapReduce.runReduce(join,
                        srcTables = [dt0+'_uids',dst],
                        dstTable = dt0 + '_joined_errors',
                        sortMode = True
                       )
    MapReduce.runReduce(join,
                        srcTables = [dt1 + '_uids',dst],
                        dstTable = dt1 + '_joined_errors',
                        sortMode = True
                       )


# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
