#!/usr/bin/env python 
# -*- coding: utf-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from urlparse import parse_qs, urlparse
from datetime import datetime
import httpagentparser
import sys
import os
import optparse
import re
import libra

def parseOptions(argv):
    """Parse the command line and return the resulting options object.

    argv -- argument vector laid out like ``sys.argv`` (program name first).
            ``None`` keeps optparse's default of reading ``sys.argv[1:]``.

    Returns the optparse values object; its ``date`` attribute holds the
    value given with ``-d``. Positional arguments are accepted but unused.

    Exits through ``parser.error`` (SystemExit) when ``-d`` is missing.
    """
    usage = 'usage: %prog <args> test-ids to parse'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-d", dest="date", type='string', help="date")

    # Parse the vector we were given instead of implicitly re-reading
    # sys.argv; element 0 is the program name and is not an option.
    args = argv[1:] if argv is not None else None
    options, _testids = parser.parse_args(args)

    if not options.date:
        parser.error('Please, specify date')

    return options

# Cache for the table produced by build_translation(); filled on first use.
TRANSLATION = None


def build_translation():
    """Return the translate() table that maps separator characters to a space.

    The table maps every code point whose Unicode general category starts
    with 'P' (punctuation), plus the characters in
    u'\\t\\n\\x0b\\x0c\\r$+<=>^`|~', to u' '. It is built once, stored in
    the module-level TRANSLATION, and the same dict is returned afterwards.
    """
    global TRANSLATION
    if TRANSLATION is None:
        import sys
        import unicodedata

        # Use the Python 2 builtins when they exist, otherwise their
        # Python 3 equivalents.
        try:
            code_points = xrange(sys.maxunicode + 1)
            to_char = unichr
        except NameError:
            code_points = range(sys.maxunicode + 1)
            to_char = chr

        # "+ 1" above: sys.maxunicode itself is a valid code point.
        table = {
            index: u' ' for index in code_points
            if unicodedata.category(to_char(index)).startswith('P')
        }
        for char in u'\t\n\x0b\x0c\r$+<=>^`|~':
            table[ord(char)] = u' '

        # Publish only the finished table so a failure part-way through
        # never leaves a partial cache behind.
        TRANSLATION = table
    return TRANSLATION

def normalize_query(query):
    """Return a normalized, UTF-8 encoded form of ``query``.

    The input is decoded as UTF-8, passed through the table returned by
    build_translation(), lower-cased, stripped of surrounding whitespace,
    and every run of two or more whitespace characters is replaced by a
    single space. Input that is not valid UTF-8 is returned unchanged.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        # Best effort: pass undecodable input through untouched.
        return query

    cleaned = text.translate(build_translation()).lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', cleaned)
    return collapsed.encode('utf8')

# Click `Path` values that Extractor searches for among a request's clicks;
# a request with such a click is written to the first output table.
# NOTE(review): presumably blockstat path codes -- confirm what each denotes.
PATHS = ['65.66', '65.568', '65.176']

class Extractor:
    """Reduce callable that turns one key's session records into Records.

    For every web/pad/touch request in the parsed session it yields either

    * a Record with tableIndex=0 when the request has a click whose Path is
      listed in PATHS (fields: uid, normalized query, tab-joined value), or
    * a Record with tableIndex=1 when there is no such click and the request
      is page 0 (fields: normalized query, request type, "date<TAB>domain").
    """

    def __init__(self, date):
        """Store the date string that is written into every output value."""
        self.date = date

    def __call__(self, key, recs):
        """Yield output Records for the session identified by ``key``.

        key  -- session key; only keys whose first character is 'y' are
                processed, anything else (including an empty key) is skipped.
        recs -- records handed to libra.ParseSession.

        RuntimeErrors from ParseSession whose message contains 'fat user' or
        'ParseSession can' cause the key to be skipped; any other
        RuntimeError propagates.
        """
        uid = key
        # Guard the empty key explicitly so uid[0] cannot raise IndexError.
        if not uid or uid[0] != 'y':
            return

        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except RuntimeError as e:
            message = str(e)
            if 'fat user' in message or 'ParseSession can' in message:
                return
            # Bare raise keeps the original traceback intact.
            raise

        for r in session:
            if r.IsA('TYandexWebRequest'):
                req_type = 'web'
            elif r.IsA('TPadYandexWebRequest'):
                req_type = 'pad'
            elif r.IsA('TTouchYandexWebRequest'):
                req_type = 'touch'
            else:
                # Not one of the request kinds of interest.
                continue

            domain = r.ServiceDomRegion
            nquery = normalize_query(r.Query)
            browser = ' '.join(r.GetBrowser())
            region = r.UserRegion
            request_time = datetime.fromtimestamp(r.Timestamp).strftime('%Y-%m-%d %H:%M:%S')

            # Count clicks per block group and collect all click timestamps.
            mb_clicks = 0
            pb_clicks = 0
            clicks_times = []

            for b in r.GetMainBlocks():
                for c in b.GetClicks():
                    mb_clicks += 1
                    clicks_times.append(c.Timestamp)

            for b in r.GetParallelBlocks():
                for c in b.GetClicks():
                    pb_clicks += 1
                    clicks_times.append(c.Timestamp)

            # Offsets of the earliest and latest block click from the request
            # time; empty strings when there were no block clicks.
            if clicks_times:
                min_time = min(clicks_times) - r.Timestamp
                max_time = max(clicks_times) - r.Timestamp
            else:
                min_time = ''
                max_time = ''

            # The last click whose Path is in PATHS wins.
            out = None
            out_time = 0
            for c in r.GetClicks():
                if c.Path in PATHS:
                    out = c.ConvertedPath
                    out_time = c.Timestamp - r.Timestamp

            # NOTE(review): nquery is normally UTF-8 encoded, so the [0:1024]
            # slice may cut a multi-byte sequence -- confirm this is acceptable.
            if out:
                fields = [self.date, req_type, out,
                          out_time, min_time, max_time,
                          domain, browser, region, request_time,
                          mb_clicks, pb_clicks]
                yield Record(uid, nquery[0:1024],
                             '\t'.join([str(x) for x in fields]),
                             tableIndex=0)
            elif r.PageNo == 0:
                yield Record(nquery[0:1024], req_type,
                             '\t'.join([str(x) for x in [self.date, domain]]),
                             tableIndex=1)


def main(options):
    """Configure MapReduce and run Extractor over one day's user sessions.

    options -- object with a ``date`` attribute; the date selects the source
               table and names both destination tables.

    The server address is read from the DEF_MR_SERVER environment variable
    (KeyError if it is not set).
    """
    # Alternatives left by the original author for debugging:
    #   username='tmp'
    #   MapReduce.useDefaults(testMode=True)
    #   source table 'sample_by_yuid_1p/user_sessions/' + date
    MapReduce.useDefaults(
        server=os.environ['DEF_MR_SERVER'],
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
    )

    date = options.date
    source_table = 'user_sessions/' + date
    queries_table = 'shining/soo_queries/' + date
    all_requests_table = queries_table + '/all_reqs'

    # Destination order matters: Extractor addresses the tables by index.
    MapReduce.runReduce(
        Extractor(date),
        srcTable=source_table,
        dstTables=[queries_table, all_requests_table],
        sortMode=True,
        files=['/home/shining/data/blockstat.dict'],
    )

# Script entry point: parse the command line, then launch the reduce job.
if __name__ == '__main__':
    options = parseOptions(sys.argv)
    main(options)
