# -*- coding: UTF-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from datetime import datetime, timedelta, date
import libra
import urllib, re,random

TRANSLATION = None


def build_translation():
    """Return the shared translation table, building it on first use.

    The table maps code points to a single space and is meant to be passed
    to ``unicode.translate``. It contains every code point below
    ``sys.maxunicode`` whose Unicode general category starts with 'P'
    (punctuation), plus a fixed set of control and symbol characters.
    The result is cached in the module-level ``TRANSLATION`` so the scan
    over the code point range happens only once per process.
    """
    global TRANSLATION
    if TRANSLATION is not None:
        return TRANSLATION

    import sys
    import unicodedata

    table = {}
    for code_point in xrange(sys.maxunicode):
        if unicodedata.category(unichr(code_point)).startswith('P'):
            table[code_point] = u' '

    # Characters that are not in a 'P' category but are blanked as well.
    for symbol in u'\t\n\x0b\x0c\r$+<=>^`|~':
        table[ord(symbol)] = u' '

    TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Normalize a UTF-8 encoded query string.

    The bytes are decoded, passed through the character table returned by
    build_translation(), lower-cased, stripped of surrounding whitespace,
    and every run of two or more whitespace characters is collapsed into a
    single space.

    Returns the normalized query encoded back to UTF-8, or None when the
    input is not valid UTF-8.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        return None

    text = text.translate(build_translation()).lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', text)
    return collapsed.encode('utf8')


def Reduce(key, recs):
    """Emit one record per matching request found in a user's session.

    key  -- the user id; keys that do not start with 'y' are ignored.
    recs -- the raw records for that key, handed to libra.ParseSession.

    For every request in the parsed session that is a
    'TTouchYandexWebRequest' with ServiceDomRegion == 'ru' and a non-empty
    normalized query, yields
    Record((uid + '\\t' + query)[:200], str(timestamp), '').
    The Record arguments appear to be (key, subkey, value) -- confirm
    against mapreducelib.

    Sessions that libra fails to parse are skipped silently (best effort).
    """
    uid = key
    # startswith() also rejects an empty key, which uid[0] would crash on.
    if not uid.startswith('y'):
        return

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Deliberately best effort: drop sessions that cannot be parsed, but
        # let SystemExit / KeyboardInterrupt propagate.
        return

    for request in session:
        if not request.IsA('TTouchYandexWebRequest'):
            continue
        if request.ServiceDomRegion != 'ru':
            continue

        query = normalize_query(request.Query)
        # normalize_query returns None for input that is not valid UTF-8 and
        # an empty string when nothing is left after normalization; both are
        # skipped (len(None) used to raise TypeError here).
        if not query:
            continue

        # NOTE(review): the 200-character cut is applied to the encoded key,
        # so it can drop the tail of a long query -- confirm this is intended.
        yield Record((uid + '\t' + query)[:200], str(request.Timestamp), '')



def aggr_deltas(key, recs):
    """Count the records grouped under ``key``.

    Consumes ``recs`` completely and yields a single
    Record('', key, <count as string>).
    """
    count = sum(1 for _ in recs)
    yield Record('', key, str(count))


def map_dates(rec):
    """Keep only records whose timestamp falls within 2016-02-01..2016-02-07.

    ``rec.subkey`` is read as a Unix timestamp and converted to a calendar
    date with datetime.fromtimestamp (i.e. in the local time zone of the
    machine running the job). Matching records are re-emitted as
    Record(rec.key, '', 'YYYY-MM-DD'); all others are dropped.
    """
    week = (
        '2016-02-01',
        '2016-02-02',
        '2016-02-03',
        '2016-02-04',
        '2016-02-05',
        '2016-02-06',
        '2016-02-07',
    )
    record_key = rec.key
    day = datetime.fromtimestamp(float(rec.subkey)).date().isoformat()

    if day in week:
        yield Record(record_key, '', day)


def aggr_queries(key,recs):
    l = key.split('\t')
    uid = l[0]
    q = l[1]

    yield Record(uid,'',q)

def aggr_sum(key, recs):
    """Sum the integer values of all records grouped under ``key``.

    Yields a single Record(<number of records>, '', <sum of values>), both
    numbers rendered as strings. ``key`` itself is not used in the output.
    """
    total = 0
    seen = 0
    # enumerate(..., 1) leaves ``seen`` at the number of records consumed;
    # it stays 0 when ``recs`` is empty.
    for seen, rec in enumerate(recs, 1):
        total += int(rec.value)

    yield Record(str(seen), '', str(total))

def map_vals(rec):
    """Re-key a record under the empty key with value and subkey swapped.

    Emits Record('', rec.value, rec.subkey): the input value is passed in
    the second position and the input subkey in the third.
    """
    value, subkey = rec.value, rec.subkey
    yield Record('', value, subkey)

def count_med(key, recs):
    """Emit the subkeys of the records at two fixed positions of ``recs``.

    Walks ``recs`` in iteration order and yields
    Record('', '', str(rec.subkey)) for the records at zero-based positions
    15310764 and 15310765, then stops reading. Inputs shorter than that
    yield nothing.

    NOTE(review): the positions are presumably the two middle rows of a
    sorted table with 30621530 rows (hence "med") -- confirm before reusing
    this on another table.
    """
    first_wanted = 15310764
    last_wanted = 15310765

    for position, rec in enumerate(recs):
        if position > last_wanted:
            break
        if position >= first_wanted:
            yield Record('', '', str(rec.subkey))



def main():
    """Configure the MapReduce client and run the enabled pipeline stages.

    Only the last two stages are active: map_vals rewrites the aggregated
    table into '<aggr>_sk', and count_med reduces that into '<aggr>_med'.
    The earlier stages are kept below as comments; presumably their output
    tables already exist -- re-enable them to rebuild from the source table.
    """
    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
        # testMode=True,
    )

    # Table names, in pipeline order.
    source_table = 'ensuetina/QUERIES_HISTOGRAM/TOUCH/all_q'
    week_table = 'ensuetina/QUERIES_HISTOGRAM/TOUCH/1/week'
    uniq_table = 'ensuetina/QUERIES_HISTOGRAM/TOUCH/1/week_uniq'
    aggr_table = 'ensuetina/QUERIES_HISTOGRAM/TOUCH/1/week_uniq_aggr'
    total_table = 'ensuetina/QUERIES_HISTOGRAM/TOUCH/1/week_uniq_aggr_total'

    # Disabled stages (each consumes the previous stage's table):
    #
    # MapReduce.runMap(map_dates,
    #                  srcTable=source_table,
    #                  dstTable=week_table,
    #                  sortMode=True)
    #
    # MapReduce.runReduce(aggr_queries,
    #                     srcTable=week_table,
    #                     dstTable=uniq_table,
    #                     sortMode=True)
    #
    # MapReduce.runReduce(aggr_deltas,
    #                     srcTable=uniq_table,
    #                     dstTable=aggr_table,
    #                     sortMode=True)
    #
    # MapReduce.runReduce(aggr_sum,
    #                     srcTable=aggr_table,
    #                     dstTable=total_table,
    #                     sortMode=True)

    rekeyed_table = aggr_table + '_sk'
    median_table = aggr_table + '_med'

    MapReduce.runMap(
        map_vals,
        srcTable=aggr_table,
        dstTable=rekeyed_table,
        sortMode=True,
    )
    MapReduce.runReduce(
        count_med,
        srcTable=rekeyed_table,
        dstTable=median_table,
        sortMode=True,
    )


if __name__ == '__main__':
    main()
