# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from datetime import datetime
import libra
import urllib, re,random

# Cached character translation table; stays None until build_translation()
# constructs it on first use.
TRANSLATION = None

def build_translation():
    """Return the translation table used to blank out punctuation.

    The table maps code points (ints) to a single space and is suitable for
    ``unicode.translate``.  It covers every code point whose Unicode general
    category starts with ``'P'`` (punctuation), plus an explicit list of
    control whitespace and symbol characters.

    The table is built on the first call and cached in the module-level
    ``TRANSLATION``; later calls return the same dict object.

    Python 2 only: relies on the ``xrange`` and ``unichr`` builtins.
    """
    global TRANSLATION
    if TRANSLATION is not None:
        return TRANSLATION

    import sys
    import unicodedata

    # Build into a local dict and publish it only once it is complete.
    table = {}
    # sys.maxunicode is itself a valid code point, hence the inclusive bound.
    for code_point in xrange(sys.maxunicode + 1):
        if unicodedata.category(unichr(code_point)).startswith('P'):
            table[code_point] = u' '

    # Characters that are not in a 'P' category but are blanked out as well.
    for char in u'\t\n\x0b\x0c\r$+<=>^`|~':
        table[ord(char)] = u' '

    TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Normalize a UTF-8 encoded query string.

    The query is decoded from UTF-8, every character present in the table
    from ``build_translation()`` is replaced by a space, the text is
    lower-cased, surrounding whitespace is stripped and each run of two or
    more whitespace characters is collapsed into a single space.

    Returns the normalized query re-encoded as UTF-8, or ``None`` when the
    input is not valid UTF-8.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        # Undecodable input has no normalized form.
        return None

    cleaned = text.translate(build_translation()).lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', cleaned)
    return collapsed.encode('utf8')


# Path substrings identifying the blocks of interest, listed in the order in
# which SHOW records are emitted for a request.
_SHOW_MARKERS = (
    'parallel/object-answer/',
    'snippet/images/',
    'snippet/video/',
    'snippet/news',
    'snippet/companies/map',
)

# The same substrings, in the order in which CLICK records are emitted for a
# single click.
_CLICK_MARKERS = (
    'snippet/images/',
    'snippet/video/',
    'snippet/news',
    'snippet/companies/map',
    'parallel/object-answer/',
)

# Clicks with a dwell time strictly greater than this count as "long".
_LONG_CLICK_DWELL = 30


def Reduce(key, recs):
    """Emit SHOW and CLICK records for the tracked blocks of one session.

    Args:
        key: reduce key, used as the user id.  Only keys starting with
            ``'y'`` are processed; anything else (including an empty key)
            produces no output.
        recs: records for this key, handed to ``libra.ParseSession`` together
            with ``./blockstat.dict`` (presumably the dictionary file shipped
            with the job -- confirm against the job configuration).

    Yields:
        ``Record`` objects.  For every desktop or tablet web request with
        ``ServiceDomRegion == 'ru'``:

        * table 0: one record per tracked marker found in any of the
          request's block paths, keyed
          ``<date>\\t<ui>\\t<browser>\\tSHOW\\t<marker>``;
        * table 1: one record per click and per tracked marker found in the
          click's converted path, keyed
          ``<date>\\t<ui>\\t<browser>\\tCLICK\\t<click type>\\t<marker>\\t<path>``
          with the dwell time as the value.

    Sessions that fail to parse are skipped.
    """
    # startswith() is also safe for an empty key, unlike key[0].
    if not key.startswith('y'):
        return

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session that cannot be parsed is dropped, but
        # SystemExit / KeyboardInterrupt are allowed to propagate.
        return

    for request in session:
        if request.IsA('TYandexWebRequest'):
            ui = 'DESKTOP'
        elif request.IsA('TPadYandexWebRequest'):
            ui = 'TABLET'
        else:
            continue

        if request.ServiceDomRegion != 'ru':
            continue

        # Calendar date (YYYY-MM-DD) of the request in the local timezone.
        day = datetime.fromtimestamp(request.Timestamp).date().isoformat()
        browser = request.GetBrowser()[0]
        prefix = day + '\t' + ui + '\t' + browser

        # Collect which markers occur in at least one block of the request,
        # so that each marker is reported at most once per request.
        shown = set()
        for block in request.GetBSBlocks():
            block_path = block.Path
            for marker in _SHOW_MARKERS:
                if marker in block_path:
                    shown.add(marker)

        for marker in _SHOW_MARKERS:
            if marker in shown:
                yield Record(prefix + '\tSHOW\t' + marker, '', '',
                             tableIndex=0)

        for click in request.GetClicks():
            click_path = click.ConvertedPath
            dwell = click.DwellTime
            if int(dwell) > _LONG_CLICK_DWELL:
                click_type = 'long click'
            else:
                click_type = 'short click'

            for marker in _CLICK_MARKERS:
                if marker in click_path:
                    click_key = (prefix + '\tCLICK\t' + click_type +
                                 '\t' + marker + '\t' + click_path)
                    yield Record(click_key, '', str(dwell), tableIndex=1)



def aggr(key,recs):
    """Count the records received for *key* and emit that count.

    Yields a single ``Record`` whose key is *key*, whose subkey is empty and
    whose value is the number of records in *recs* as a string.
    """
    total = sum(1 for _ in recs)
    yield Record(key, '', str(total))


def main():
    """Run the per-day reduce over user sessions, then aggregate the results.

    For each configured day of October 2015 the ``Reduce`` operation reads
    ``user_sessions/201510<day>`` and appends to the shared shows and clicks
    tables.  Afterwards each of those tables is reduced with ``aggr`` into a
    sibling ``<table>_aggr`` table.
    """
    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
        # testMode=True,
    )

    # Destination tables do not depend on the day, so they are defined once
    # here; the aggregation step below must not rely on loop-local names.
    shows_table = 'ensuetina/RESEARCH-1536/recalc/shows'
    clicks_table = 'ensuetina/RESEARCH-1536/recalc/clicks'

    days = ['19', '20', '21', '22', '23', '24', '25']
    for day in days:
        # NOTE(review): appendMode=True presumably means re-running the
        # script adds to existing tables -- confirm they are cleared first.
        MapReduce.runReduce(
            Reduce,
            srcTable='user_sessions/201510' + day,
            dstTables=[shows_table, clicks_table],
            files=['/home/ensuetina/data/blockstat.dict'],
            appendMode=True,
            sortMode=True,
        )

    for table in (shows_table, clicks_table):
        MapReduce.runReduce(
            aggr,
            srcTable=table,
            dstTable=table + '_aggr',
            sortMode=True,
        )


# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
