#!/usr/bin/env python 
# -*- coding: utf-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from urlparse import parse_qs, urlparse
import datetime
import sys
import os
import optparse
import re
import libra
import random

def parseOptions(argv):
    """Parse the command line and return the optparse options object.

    argv -- full argument vector with the program name at index 0
            (the shape of sys.argv). None falls back to sys.argv.

    Returns the options object; its ``date`` attribute holds the value
    given with -d.  Positional arguments are accepted but ignored, even
    though the usage string mentions test ids.

    If -d is missing, parser.error() prints the usage message and exits
    the process (SystemExit).
    """
    usage = 'usage: %prog <args> test-ids to parse'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-d", dest="date", type='string', help="date")

    # Parse the vector we were handed instead of silently re-reading
    # sys.argv; optparse expects the arguments without the program name.
    args = None if argv is None else list(argv[1:])
    options, _positional = parser.parse_args(args)

    if not options.date:
        parser.error('Please, specify date')

    return options

# Lazily built table for unicode.translate(); None until first use.
TRANSLATION = None


def build_translation():
    """Return the shared code-point -> u' ' translation table.

    The table is built on the first call and cached in the module-level
    TRANSLATION; later calls return the same dict object.

    It maps to a single space:
      * every code point below sys.maxunicode whose Unicode general
        category starts with 'P' (the punctuation categories), and
      * the explicitly listed control/whitespace and symbol characters.
    """
    global TRANSLATION
    if TRANSLATION is not None:
        return TRANSLATION

    import sys
    import unicodedata

    table = {}
    for code_point in xrange(sys.maxunicode):
        category = unicodedata.category(unichr(code_point))
        if category.startswith('P'):
            table[code_point] = u' '

    # Characters outside the 'P' categories that are blanked as well.
    for extra in u'\t\n\x0b\x0c\r$+<=>^`|~':
        table[ord(extra)] = u' '

    TRANSLATION = table
    return TRANSLATION

def normalize_query(query):
    """Return a normalized, UTF-8 encoded form of *query*.

    The query is decoded as UTF-8, every character present in the
    build_translation() table is replaced by a space, the text is
    lower-cased, surrounding whitespace is stripped and each run of two
    or more whitespace characters is collapsed to one space.

    Input that is not valid UTF-8 is returned unchanged (best effort).
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        # Undecodable input is passed through untouched rather than dropped.
        return query

    cleaned = text.translate(build_translation()).lower().strip()
    # NOTE(review): no re.UNICODE flag is passed, so on Python 2 '\s'
    # presumably matches ASCII whitespace only -- confirm this is intended.
    collapsed = re.sub(r'\s\s+', ' ', cleaned)
    return collapsed.encode('utf8')

class Extractor:
    """Reduce callable that emits the queries of first-page web requests.

    For every key (user id) it parses the user's session with libra and
    yields one Record per matching request.
    """

    def __init__(self, date):
        # Stored for reference only; __call__ does not read it.
        self.date = date

    def __call__(self, key, recs):
        """Yield Record(uid, req_type, 'query<TAB>normalized query').

        key  -- user id; only ids starting with 'y' are processed.
        recs -- records for that key, handed to libra.ParseSession.

        A request is emitted when it is a TYandexWebRequest ('desktop')
        or a TPadYandexWebRequest ('pad'), its ServiceDomRegion is 'ru'
        and its PageNo is 0.

        RuntimeError from ParseSession is swallowed (the key is skipped)
        when its message contains 'fat user' or 'ParseSession can';
        any other RuntimeError propagates.
        """
        uid = key
        # startswith() also copes with an empty key, which uid[0] would
        # turn into an IndexError.
        if not uid.startswith('y'):
            return

        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except RuntimeError as e:
            message = str(e)
            if 'fat user' in message or 'ParseSession can' in message:
                return
            # Bare raise keeps the original traceback ('raise e' would
            # restart it here on Python 2).
            raise

        for request in session:
            # The pad check comes first so that a request matching both
            # types is labelled 'pad', as before.
            if request.IsA('TPadYandexWebRequest'):
                req_type = 'pad'
            elif request.IsA('TYandexWebRequest'):
                req_type = 'desktop'
            else:
                continue

            if request.ServiceDomRegion != 'ru':
                continue
            if request.PageNo != 0:
                continue

            query = request.Query
            norm_query = normalize_query(query)

            # NOTE(review): the raw query is not escaped; if it can contain
            # a tab the value will have more than two fields -- confirm.
            yield Record(uid, req_type, '\t'.join([query, norm_query]))

def main(options):
    """Configure MapReduce and run the Extractor reduce for options.date.

    Reads  //userdata/user_sessions/<date>
    Writes //tmp/shining/pasearch_flow_sample/<date>

    The blockstat dictionary is shipped with the job through ``files``;
    Extractor opens it as ./blockstat.dict.

    A legacy configuration (server taken from the DEF_MR_SERVER environment
    variable, input user_sessions/<date>, output
    shining/padsearch_flow_sample/<date>) was previously kept here as
    commented-out code.
    """
    MapReduce.useDefaults(
        server='plato.yt.yandex.net',
        mrExec='mapreduce-yt',
        verbose=True,
        saveSource=True,
        lenvalMode=True,
        loggerName=None,
    )

    source_table = '//userdata/user_sessions/' + options.date
    # NOTE(review): 'pasearch' differs from the legacy 'padsearch' table
    # name -- possibly a typo; confirm before renaming, consumers may
    # depend on the current path.
    target_table = '//tmp/shining/pasearch_flow_sample/' + options.date

    reducer = Extractor(options.date)
    MapReduce.runReduce(
        reducer,
        srcTable=source_table,
        dstTable=target_table,
        sortMode=True,
        files=['/home/shining/data/blockstat.dict'],
    )

if __name__ == '__main__':
    # Script entry point: parse the command line, then launch the job.
    options = parseOptions(sys.argv)
    main(options)
