# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from datetime import datetime, timedelta, date
import libra
import urllib, re, urlparse

# Cache for the unicode.translate() table; stays None until
# build_translation() fills it in on first use.
TRANSLATION = None

def build_translation():
    """Return the code point -> u' ' table used to blank out punctuation.

    The table is computed once and cached in the module-level
    ``TRANSLATION``; later calls return the cached dict. It maps every code
    point below ``sys.maxunicode`` whose Unicode category starts with 'P'
    (punctuation), plus a fixed set of ASCII control and symbol characters,
    to a single space.
    """
    global TRANSLATION
    if TRANSLATION is not None:
        return TRANSLATION

    # Imported lazily: only needed the first time the table is built.
    import sys
    import unicodedata

    table = {}
    for codepoint in xrange(sys.maxunicode):
        if unicodedata.category(unichr(codepoint)).startswith('P'):
            table[codepoint] = u' '

    # Characters that are not in a 'P' category but should be blanked too.
    for extra in u'\t\n\x0b\x0c\r$+<=>^`|~':
        table[ord(extra)] = u' '

    TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Normalize a UTF-8 encoded query string.

    The query is decoded, characters listed in the translation table are
    replaced by spaces, the text is lower-cased and stripped, and runs of
    two or more whitespace characters are collapsed into one space.

    Returns the normalized query encoded back to UTF-8, or None when
    ``query`` is not valid UTF-8.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        # Undecodable input is reported as "no query" rather than raising.
        return None

    cleaned = text.translate(build_translation()).lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', cleaned)
    return collapsed.encode('utf8')


def Reduce(key, recs):
    """Yield one record per non-dynamic click on a 'yandex.' URL.

    Reduce step of the job. ``key`` is used as the user id and ``recs`` are
    passed to ``libra.ParseSession`` together with ``./blockstat.dict``.
    Keys that do not start with 'y' and sessions that fail to parse produce
    no output.

    Only web, images, video and portal requests that carry one of the two
    test ids are considered. For each of their clicks whose variables do not
    mention a dynamic click and whose unquoted URL contains 'yandex.', a
    ``Record(uid, '', value, tableIndex=N)`` is yielded where ``value`` is
    the tab-separated string::

        ui, slot, date, region, converted path, host, url

    ``tableIndex`` is 0 for the control test id and 1 for the experiment one.
    """
    uid = key
    # Slice rather than index so an empty key is skipped instead of raising.
    if uid[:1] != 'y':
        return

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: an unparsable session is dropped. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return

    control_id = '17408'
    experiment_id = '17405'

    # Request type -> service label; checked in order, first match wins.
    services = (
        ('TTouchYandexWebRequest', 'web'),
        ('TTouchYandexImagesRequest', 'images'),
        ('TYandexVideoRequest', 'video'),
        ('TTouchYandexPortalRequest', 'morda'),
    )

    for request in session:
        ui = None
        for type_name, label in services:
            if request.IsA(type_name):
                ui = label
                break
        if ui is None:
            continue

        try:
            # ReqID is only probed, its value is not used: the original
            # logic falls back to '-' for the region whenever either
            # attribute is unavailable, and that is preserved here.
            # NOTE(review): confirm whether the ReqID probe is still wanted.
            request.ReqID
            reg = request.ServiceDomRegion
        except Exception:
            reg = '-'

        # Calendar date (YYYY-MM-DD) of the request in the local timezone.
        ts = datetime.fromtimestamp(request.Timestamp).date().isoformat()

        if request.HasTestID(control_id):
            slot = 'control ' + control_id
            table_index = 0
        elif request.HasTestID(experiment_id):
            slot = 'exp ' + experiment_id
            table_index = 1
        else:
            continue

        for click in request.GetClicks():
            path = click.ConvertedPath
            url = urllib.unquote(str(click.Url))
            host = urlparse.urlparse(url).netloc

            click_vars = str(click.GetVars())
            if 'dynamic_click' in click_vars or 'dynamic-click' in click_vars:
                continue

            # NOTE(review): this matches 'yandex.' anywhere in the unquoted
            # URL, not only in the host part.
            if 'yandex.' not in url:
                continue

            value = '\t'.join((ui, slot, ts, reg, path, host, url))
            yield Record(uid, '', value, tableIndex=table_index)


def main():
    """Configure the MapReduce client and run ``Reduce`` once per source table.

    Each source table is ``user_sessions/2015`` followed by one of the
    four-character suffixes listed below. Every run appends to the same two
    destination tables; their order in ``dstTables`` corresponds to the
    ``tableIndex`` values used by ``Reduce`` (0 = control, 1 = experiment).
    """
    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
        # testMode=True,
    )

    control_table = 'ensuetina/NEW_TOUCH_HEAD_TRAFFIC/1/data_control'
    experiment_table = 'ensuetina/NEW_TOUCH_HEAD_TRAFFIC/1/data_exp'

    table_suffixes = (
        '1023', '1024', '1025', '1026', '1027', '1028', '1029', '1030',
        '1103', '1104', '1105', '1106', '1107', '1108', '1109', '1110',
        '1111', '1112', '1113', '1114', '1115', '1116', '1117',
    )

    for suffix in table_suffixes:
        MapReduce.runReduce(
            Reduce,
            srcTable='user_sessions/2015' + suffix,
            dstTables=[control_table, experiment_table],
            files=['/home/ensuetina/data/blockstat.dict'],
            appendMode=True,
            sortMode=True,
        )



# Submit the jobs only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
