# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from datetime import datetime, timedelta, date
import libra
import urllib, re,random
from libkernelgeo import TRegionsDB

# Lazily built translation table shared by all calls; see build_translation().
TRANSLATION = None


def build_translation():
    """Return the code-point -> space table used to blank out punctuation.

    The table is built on the first call and cached in the module-level
    TRANSLATION variable; later calls return the same dict object.

    It maps to a single space:
      * every code point below sys.maxunicode whose Unicode general
        category starts with 'P' (the punctuation categories), and
      * the ASCII control whitespace and symbol characters listed
        explicitly below.
    """
    global TRANSLATION
    if TRANSLATION is not None:
        return TRANSLATION

    import sys
    import unicodedata

    blank = u' '
    table = {}
    for code_point in xrange(sys.maxunicode):
        if unicodedata.category(unichr(code_point)).startswith('P'):
            table[code_point] = blank
    # These characters are not in a 'P' category, so they are added by hand.
    for symbol in u'\t\n\x0b\x0c\r$+<=>^`|~':
        table[ord(symbol)] = blank

    TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Return a normalized UTF-8 encoded copy of *query*.

    The input is decoded as UTF-8, every character present in the table
    returned by build_translation() is replaced with a space, the text is
    lower-cased, surrounding whitespace is stripped and each run of two or
    more whitespace characters is collapsed into a single space.

    Returns None when *query* is not valid UTF-8.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        return None

    text = text.translate(build_translation()).lower().strip()
    return re.sub(r'\s\s+', ' ', text).encode('utf8')


def _format_clicks(request):
    """Render the clicks of *request* as the text appended to an output row.

    Each click contributes
    '\\t<ConvertedPath> (<DelayAfterRequest>) --> <unquoted Url> ; '.
    Returns an empty string when the request has no clicks.
    """
    return ''.join(
        '\t' + click.ConvertedPath
        + ' (' + str(click.DelayAfterRequest) + ') --> '
        + urllib.unquote(str(click.Url)) + ' ; '
        for click in request.GetClicks()
    )


def Reduce(key, recs):
    """Emit one row per 'ru' TPadYandexWebRequest of a single user.

    key  -- user id; only ids starting with 'y' are processed
            (presumably yandexuid-based ids -- confirm against the source
            table).
    recs -- the user's raw log records, handed to libra.ParseSession
            together with the local './blockstat.dict' file.

    Yields Record(key, '', value) where
      key   = '<uid>\\t<session number>'
      value = '<ISO timestamp>\\t<region name>\\t<normalized query>'
              followed by one '\\t...' chunk per click (see _format_clicks).

    The session number starts at 1 and is incremented whenever more than
    1800 seconds separate a kept request from the previous kept one.
    Users whose records cannot be parsed produce no output.
    """
    uid = key
    # startswith() also rejects an empty key instead of raising IndexError.
    if not uid.startswith('y'):
        return

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a user whose log cannot be parsed is skipped.
        return

    # Loaded on first use so users without any qualifying request do not
    # pay for opening the geobase file.
    geobase = None
    prev_ts = None
    sess = 1

    for request in session:
        if not request.IsA('TPadYandexWebRequest'):
            continue
        if request.ServiceDomRegion != 'ru':
            continue

        ts = request.Timestamp
        if prev_ts is not None and ts - prev_ts > 1800:
            sess += 1
        prev_ts = ts

        # NOTE: fromtimestamp() renders the time in the local timezone of
        # the machine running the reducer.
        stamp = datetime.fromtimestamp(ts).isoformat()

        query = normalize_query(request.Query)
        if query is None:
            # normalize_query() returns None for queries that are not valid
            # UTF-8; concatenating None below would abort the whole job.
            query = ''

        reg = request.UserRegion
        if geobase is None:
            geobase = TRegionsDB('./geodata3.bin')
        try:
            reg_name = geobase.GetName(int(reg))
        except Exception:
            reg_name = 'no reg found - regCode=' + str(reg)

        yield Record(
            uid + '\t' + str(sess),
            '',
            stamp + '\t' + reg_name + '\t' + query + _format_clicks(request),
        )

def aggr(key, recs):
    """Emit a single record per key, with an empty subkey and empty value.

    *recs* is never read, so the output holds one row per distinct key.
    """
    marker = Record(key, '', '')
    yield marker

def get_sample(key, recs):
    """Pass through all records of roughly one key in ten; drop the rest.

    One random draw from 1..10 is made per call. When it equals 1 every
    record in *recs* is re-emitted with its own key, an empty subkey and
    its value; otherwise nothing is yielded.
    """
    selected = random.randint(1, 10) == 1
    if selected:
        for item in recs:
            yield Record(item.key, '', item.value)

def main():
    """Run the sampling and aggregation steps over the tablet-session table.

    Steps:
      1. Configure the MapReduce client defaults.
      2. Walk the days 2015-11-25 .. 2015-12-04 and print each one. The
         per-day extraction with Reduce is disabled in this version of the
         script and kept below as a comment.
      3. Run get_sample over the output table into '<output>_sample'.
      4. Run aggr over '<output>_sample' into '<output>_sample_aggr'.
    """
    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
        # testMode=True,
    )

    # Defined before the loop: the steps after the loop need it even if the
    # date range below is ever made empty.
    dt = 'ensuetina/GET_PAD_SESSIONS/output'

    day = datetime.strptime('20151125', '%Y%m%d').date()
    end_day = datetime.strptime('20151205', '%Y%m%d').date()  # exclusive
    while day < end_day:
        cur_d = day.strftime('%Y%m%d')
        print(cur_d)

        # Disabled per-day extraction step (appends each day's rows to dt):
        # src = 'sample_by_yuid_1p/user_sessions/' + cur_d
        # MapReduce.runReduce(Reduce,
        #                     srcTable = src,
        #                     dstTable = dt,
        #                     files = ['/home/ensuetina/data/blockstat.dict','/home/ensuetina/data/geodata3.bin'],
        #                     appendMode = True,
        #                     sortMode = True
        #                     )

        day += timedelta(days=1)

    MapReduce.runReduce(
        get_sample,
        srcTable=dt,
        dstTable=dt + '_sample',
        sortMode=True,
    )

    MapReduce.runReduce(
        aggr,
        srcTable=dt + '_sample',
        dstTable=dt + '_sample_aggr',
        sortMode=True,
    )


if __name__ == '__main__':
    main()
