#!/usr/bin/env python 
# -*- coding: utf-8 -*-

from mapreducelib import MapReduce, Record
from collections import defaultdict
from urlparse import parse_qs, urlparse
from datetime import datetime, timedelta
import httpagentparser
import sys
import os
import optparse
import re
import libabt
import libra

def parseOptions(argv):
    """Parse the command line and return the parsed option values.

    argv -- full argument vector with the program name first (e.g.
            sys.argv).  If None, optparse falls back to sys.argv[1:].

    Returns the optparse Values object; its ``start`` and ``finish``
    attributes are guaranteed to be non-empty strings.  main() hands them to
    daterange(), which parses them with the '%Y%m%d' format.

    Exits through parser.error() (SystemExit) when -s or -f is missing.
    """
    usage='usage: %prog <args> test-ids to parse'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-s", dest="start", type='string', help = "start date")
    parser.add_option("-f", dest="finish", type='string', help = "finish date")

    # Parse the vector we were given instead of silently re-reading sys.argv;
    # argv[0] is the program name and must not be treated as an argument.
    args = None if argv is None else list(argv[1:])
    # Positional arguments (test ids) are accepted but not used by this script.
    options, _testids = parser.parse_args(args)

    if not options.start:
        parser.error('Please, specify start date')
    if not options.finish:
        parser.error('Please, specify finish date')

    return options

def daterange(start_date, end_date):
    """Yield every day from start_date to end_date, both ends included.

    Both arguments and every yielded item are strings in '%Y%m%d' form
    (e.g. '20240131').  Nothing is yielded when end_date precedes
    start_date.  Being a generator, a malformed date raises ValueError only
    once iteration starts.
    """
    day_format = '%Y%m%d'
    current = datetime.strptime(start_date, day_format)
    last = datetime.strptime(end_date, day_format)
    one_day = timedelta(days=1)

    while current <= last:
        yield current.strftime(day_format)
        current += one_day

# Lazily built table for unicode.translate(); stays None until first use.
TRANSLATION = None


def build_translation():
    """Return the code point -> u' ' table used to blank out punctuation.

    The table maps every code point whose Unicode general category starts
    with 'P' (punctuation), plus a fixed set of control whitespace and
    symbol characters, to a single space.  It is computed on the first call
    and cached in the module-level TRANSLATION; later calls return the same
    dict object.
    """
    global TRANSLATION
    if TRANSLATION is not None:
        return TRANSLATION

    import sys
    import unicodedata

    table = {}
    # Scanning the whole code point range is slow, hence the caching above.
    for codepoint in xrange(sys.maxunicode):
        if unicodedata.category(unichr(codepoint)).startswith('P'):
            table[codepoint] = u' '

    # These characters are not in a 'P' category but are blanked as well.
    for extra in u'\t\n\x0b\x0c\r$+<=>^`|~':
        table[ord(extra)] = u' '

    TRANSLATION = table
    return TRANSLATION

def normalize_query(query):
    """Return a canonical UTF-8 form of a raw query byte string.

    The query is decoded as UTF-8, characters listed in the
    build_translation() table are replaced with spaces, the text is
    lower-cased, stripped, and every run of two or more whitespace
    characters is collapsed to one space.  The result is encoded back to
    UTF-8.

    If the input is not valid UTF-8 it is returned untouched.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        # Best effort: keep undecodable queries as they are.
        return query

    cleaned = text.translate(build_translation()).lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', cleaned)
    return collapsed.encode('utf8')

# Names of the request features copied into the output, in output order.
# Extractor only emits the ones present in the extracted feature mapping.
FEATURES = [
    'web.abandonment',
    'web.abandonment_long',
    'web.full_abandonment',
    'web.inversion_rate',
    'web.only_under3_clicks',
    'web.no_top3_clicks',
    'web.no_good_top3_clicks',
    'web.wizards_ctr',
    'web.dwell_time',
    'web.sole_navig_click',
    'web.abandonment_with_direct',
    'web.wizards_part',
    'web.full_abandonment_long',
    'web.quick_dwelltime_rate',
    'serp_ratio_hidden_web_results',
]

class Extractor:
    """Reduce callable: one user's session records -> one row per request.

    Called with a user id key and that user's records, it parses the
    session with libra and yields a Record for every first-page web, pad or
    touch search request:

        key    = normalized query <TAB> request type <TAB> service domain
        subkey = user id
        value  = time <TAB> region <TAB> browser <TAB> name=v1 v2 ... fields

    Only features listed in the module-level FEATURES are emitted, and only
    those present for the request.
    """

    # (libra request class name, short label), tested in this order.
    _REQUEST_TYPES = (
        ('TYandexWebRequest', 'web'),
        ('TPadYandexWebRequest', 'pad'),
        ('TTouchYandexWebRequest', 'touch'),
    )

    # RuntimeError messages containing one of these mean "skip this user".
    _SKIPPABLE_ERRORS = ('fat user', 'ParseSession can')

    def __init__(self, date):
        # Date of the processed table; stored but not read by __call__.
        self.date = date

    def _request_type(self, request):
        """Return 'web', 'pad' or 'touch' for a request, or None otherwise."""
        for class_name, label in self._REQUEST_TYPES:
            if request.IsA(class_name):
                return label
        return None

    def __call__(self, key, recs):
        """Yield output Records for the user identified by key.

        key  -- user id; users whose id does not start with 'y' (including
                an empty id) are skipped.
        recs -- that user's session records, passed to libra.ParseSession.

        Raises any RuntimeError from libra.ParseSession other than the
        known skippable ones, with its original traceback.
        """
        uid = key
        # startswith() instead of uid[0] so an empty key is skipped rather
        # than raising IndexError.
        if not uid.startswith('y'):
            return

        try:
            # './blockstat.dict' is shipped with the job (see files= in main).
            session = libra.ParseSession(recs, './blockstat.dict')
        except RuntimeError as e:
            message = str(e)
            if any(marker in message for marker in self._SKIPPABLE_ERRORS):
                return
            # Bare raise keeps the original traceback (``raise e`` resets it
            # in Python 2).
            raise

        for r in session:
            req_type = self._request_type(r)
            if not req_type:
                continue

            # Only the first result page is of interest.
            if r.PageNo != 0:
                continue

            domain = r.ServiceDomRegion
            nquery = normalize_query(r.Query)
            browser = ' '.join(r.GetBrowser())
            region = r.UserRegion
            # fromtimestamp() renders the time in the local timezone of the
            # machine running the job.
            timestamp = datetime.fromtimestamp(r.Timestamp).strftime('%Y-%m-%d %H:%M:%S')

            features = libabt.ExtractRequestFeatures(r)
            # Fetch the available names once instead of once per feature.
            available = features.keys()
            selected_features = '\t'.join(
                name + '=' + ' '.join(str(value) for value in features[name])
                for name in FEATURES if name in available
            )

            out = '\t'.join(str(field) for field in (timestamp, region, browser, selected_features))

            # NOTE(review): nquery is a UTF-8 byte string, so this 2000-byte
            # cap can cut a multi-byte character in half.
            out_key = '\t'.join([nquery[0:2000], req_type, domain])
            yield Record(out_key, uid, out)

def aggregate(key, recs):
    """Reduce callable: average every feature over all records of one key.

    key  -- the output key of the extraction step; passed through unchanged.
    recs -- records whose ``value`` is a tab-separated row: three leading
            fields followed by ``name=v1 v2 ...`` feature fields (the layout
            built in Extractor.__call__).

    Yields a single Record(key, str(record count), 'name=mean name=mean ')
    with feature names in sorted order, and only when the key has more than
    10 records.  The mean is taken over individual values, so a feature
    with several values in one record contributes each of them.

    Raises ValueError for a non-empty field without '=' or a value that is
    not a number.
    """
    totals = defaultdict(float)
    samples = defaultdict(int)
    count = 0

    for rec in recs:
        count += 1

        for field in rec.value.split('\t')[3:]:
            # A row with no selected features ends in an empty field; there
            # is nothing to accumulate for it.
            if not field:
                continue
            name, raw_values = field.split('=', 1)
            # split() without arguments also yields nothing for an empty
            # value list ("name="), which would otherwise break float().
            for value in raw_values.split():
                totals[name] += float(value)
                samples[name] += 1

    # Rare keys are dropped.
    if count <= 10:
        return

    # Every entry keeps its trailing space to preserve the output format.
    out = ''.join(
        name + '=' + str(total / samples[name]) + ' '
        for name, total in sorted(totals.items())
    )
    yield Record(key, str(count), out)


def main(options):
    """Run the extraction reduce for every day, then the aggregation reduce.

    options -- parsed options with ``start`` and ``finish`` date strings.

    Each day's 'user_sessions/<date>' table is reduced with Extractor into
    one shared table named after the whole period (every run passes
    appendMode=True).  That table is then reduced with aggregate into its
    '/aggregated' sibling.  The server name is read from the DEF_MR_SERVER
    environment variable; a missing variable raises KeyError.
    """
    MapReduce.useDefaults(
        server=os.environ['DEF_MR_SERVER'],
        username='userstats',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
    )
    # Alternatives kept from the original for debugging:
    #   username='tmp'
    #   MapReduce.useDefaults(testMode=True)

    # One table collects the rows of every day in the period.
    features_table = 'shining/strange_queries_abtfeatures/' + options.start + '_' + options.finish

    for date in daterange(options.start, options.finish):
        # Sampled input alternative: 'sample_by_yuid_1p/user_sessions/' + date
        MapReduce.runReduce(
            Extractor(date),
            srcTable='user_sessions/' + date,
            dstTable=features_table,
            # sortMode=True,
            appendMode=True,
            files=['/home/shining/data/blockstat.dict'],
        )

    MapReduce.runReduce(
        aggregate,
        srcTable=features_table,
        dstTable=features_table + '/aggregated',
        sortMode=True,
    )

if __name__ == '__main__':
    # Script entry point: parse the command line, then run the whole job.
    main(parseOptions(sys.argv))
