# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from datetime import datetime, timedelta, date
import libra
import urllib, re,random

TRANSLATION = None

def build_translation():
    global TRANSLATION
    if TRANSLATION is None:
        import sys
        import unicodedata
        TRANSLATION = {
            index: u' ' for index in xrange(sys.maxunicode)
            if unicodedata.category(unichr(index)).startswith('P')
        }
        for char in u'\t\n\x0b\x0c\r$+<=>^`|~':
            TRANSLATION[ord(char)] = u' '
    return TRANSLATION


def normalize_query(query):
    """Normalize a UTF-8 encoded query string for substring matching.

    The query is decoded, every character listed in the table from
    build_translation() is replaced by a space, the text is lower-cased,
    stripped, and runs of two or more whitespace characters are collapsed
    into one space.

    Returns the normalized query re-encoded as UTF-8, or None when the
    input is not valid UTF-8.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        return None

    text = text.translate(build_translation()).lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', text)
    return collapsed.encode('utf8')


def Reduce(key, recs):
    """Emit travel-related web search queries found in one user's session.

    key  -- user id; only ids starting with 'y' are processed.
    recs -- raw session records for that user, handed to libra.ParseSession.

    For every web request with ServiceDomRegion == 'ru' whose normalized
    query contains none of the stop words and at least one travel marker,
    yields Record(uid, '', 'DATE\\tUI\\tMARKER\\tQUERY') where DATE is the
    local calendar date of the request (YYYY-MM-DD), UI is 'Touch',
    'Tablet' or 'Desktop', MARKER is the first matching marker stem and
    QUERY is the normalized query.

    Sessions that fail to parse and queries that are empty or not valid
    UTF-8 are skipped silently.
    """
    uid = key
    # startswith() rather than uid[0]: an empty key is skipped instead of
    # raising IndexError.
    if not uid.startswith('y'):
        return

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: an unparsable session is dropped. Exception (not a
        # bare except) so KeyboardInterrupt/SystemExit still propagate.
        return

    # Stems are tried in this order; the first one found labels the query.
    markers = ('жд ', 'билет', 'поезд', 'авиа', 'самолет', 'экскурси', 'тур',
               'путешестви', 'путевк', 'отель', 'отели', 'санатори')

    # A query containing any of these substrings is discarded.
    stop_words = ('квартир', 'outlet', 'village', 'аутлет', 'магазин',
                  'жилой', 'арендов', 'аренда',
                  'ауди', 'audi', 'авито', 'avito',
                  'android', 'samsung', 'lenovo', 'андроид', 'самсунг',
                  'phillips', 'nikon', 'windows', 'beats')

    for r in session:
        # NOTE(review): the order of these checks is kept from the original;
        # presumably the touch/pad types are specializations of
        # TYandexWebRequest - confirm before reordering.
        if r.IsA('TTouchYandexWebRequest'):
            ui = 'Touch'
        elif r.IsA('TPadYandexWebRequest'):
            ui = 'Tablet'
        elif r.IsA('TYandexWebRequest'):
            ui = 'Desktop'
        else:
            continue

        if r.ServiceDomRegion != 'ru':
            continue

        q = normalize_query(r.Query)
        # normalize_query() returns None for undecodable input, so test
        # truthiness instead of len(q), which would raise TypeError.
        if not q:
            continue

        if any(word in q for word in stop_words):
            continue

        # 'жк' is a stop word unless it only occurs as part of 'держк'.
        if 'жк' in q and 'держк' not in q:
            continue

        marker = next((m for m in markers if m in q), None)
        if marker is None:
            continue

        # Computed only for records that are actually emitted.
        ts = datetime.fromtimestamp(r.Timestamp).date().isoformat()

        yield Record(uid, '', '\t'.join((ts, ui, marker, q)))

def swap_keys(rec):
    """Turn an extracted record's value into the key used for counting.

    rec.value is expected to be 'DATE\\tUI\\tMARKER\\tQUERY' as produced by
    Reduce(). The marker stem is replaced by its display label and the
    query is limited to 200 bytes; the result is yielded as
    Record('DATE\\tUI\\tLABEL\\tQUERY', '', '').

    A marker stem missing from the label table is emitted as the literal
    string 'None' (behaviour kept from the original implementation).
    """
    labels = {'жд ': 'поезд',
              'билет': 'билеты',
              'поезд': 'поезд',
              'авиа': 'самолет',
              'самолет': 'самолет',
              'экскурси': 'экскурсии',
              'тур': 'туры',
              'путешестви': 'путешествие',
              'путевк': 'путевки',
              'отель': 'отели',
              'отели': 'отели',
              'санатори': 'санатории',
              }

    fields = rec.value.split('\t')
    ts = fields[0]
    ui = fields[1]
    stem = fields[2]

    # The query is a UTF-8 byte string, so a plain [:200] may cut a
    # multi-byte character in half. Decoding with 'ignore' drops the
    # dangling bytes, keeping the key valid UTF-8 and at most 200 bytes.
    q = fields[3][:200].decode('utf8', 'ignore').encode('utf8')

    label = str(labels.get(stem))

    yield Record('\t'.join((ts, ui, label, q)), '', '')

def aggr(key, recs):
    """Yield one Record for *key* whose value is the number of records in *recs*.

    The count is emitted as a decimal string; the subkey is left empty.
    """
    total = sum(1 for _ in recs)
    yield Record(key, '', str(total))


def main(run_extraction=False):
    """Run the travel-query pipeline on the MapReduce cluster.

    run_extraction -- when true, first run Reduce() over the daily
        'sample_by_yuid_1p/user_sessions/YYYYMMDD' tables, appending to the
        extraction table. Days are visited from 2015-11-26 up to (not
        including) 2016-06-15 with a random step of 1-3 days, so only a
        subset of days is processed. When false (the default) only the
        first date is printed and the extraction step is skipped, which is
        what the original code did through an unconditional ``break``.

    In both cases the extraction table is then re-keyed with swap_keys()
    into '<table>_swapped' and counted with aggr() into
    '<table>_swapped_aggr'.
    """
    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
        # testMode=True,
    )

    # Bound outside the loop so the later steps never depend on the loop
    # body having executed.
    dt = 'ensuetina/TRAVEL/outout_2016'

    cur_day = date(2015, 11, 26)
    end_day = date(2016, 6, 15)

    while cur_day < end_day:
        cur_d = cur_day.strftime('%Y%m%d')
        print(cur_d)

        if not run_extraction:
            # Extraction is disabled: reuse the existing contents of dt.
            break

        MapReduce.runReduce(Reduce,
                            srcTable='sample_by_yuid_1p/user_sessions/' + cur_d,
                            dstTable=dt,
                            files=['/home/ensuetina/data/blockstat.dict'],
                            appendMode=True,
                            sortMode=True)

        # Random stride: only some of the days in the range are sampled.
        cur_day += timedelta(days=random.randint(1, 3))

    d = dt + '_swapped'

    MapReduce.runMap(swap_keys,
                     srcTable=dt,
                     dstTable=d,
                     sortMode=True)
    MapReduce.runReduce(aggr,
                        srcTable=d,
                        dstTable=d + '_aggr',
                        sortMode=True)


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
