#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import yt.wrapper
import sys
import urlparse
import re
import urllib
import urllib2
import math
import json
import time
import datetime
from datetime import date, datetime, timedelta
import argparse
import ast
import libra

def parse_args():
    """Parse the command-line options of the script.

    Returns:
        argparse.Namespace with the attributes ``timestamp``, ``from_date``,
        ``to_date`` and ``blockstat`` (the destination of ``--bs``).
    """
    # (flags, keyword arguments) for every supported option, in help order.
    option_table = (
        (('--timestamp',),
         dict(help='date timestamp for calculation')),
        (('--from_date',),
         dict(default='2017-02-28',
              help='from date for calculation (format: YYYY-MM-DD)')),
        (('--to_date',),
         dict(default='2017-02-28',
              help='to date for calculation (format: YYYY-MM-DD)')),
        (("--bs",),
         dict(dest="blockstat", help="path to blockstat.dict",
              default='/home/galamaj/blockstat.dict', required=False)),
        # Disabled options, kept for reference:
        # (('--mr_server',), dict(default='hahn', help='MR server (hahn, banach, ...)')),
        # (('--yt_pool',), dict(default='robot-suggestor-dev', help='YT pool')),
    )
    cli = argparse.ArgumentParser(add_help=True, description='Suggest metrics calc')
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Build the list of ``YYYY-MM-DD`` day strings to process.

    Args:
        timestamp: optional string of digits holding a Unix timestamp. Only
            its first 10 characters are used (presumably so that millisecond
            timestamps are accepted as well). The matching local-time day is
            put first in the result.
        from_date: optional first day of an inclusive range, ``YYYY-MM-DD``.
        to_date: optional last day of the range, ``YYYY-MM-DD``. When
            ``from_date`` is given without ``to_date`` the range ends today.

    Returns:
        List of day strings: the timestamp day (if any) followed by every day
        of the range (if ``from_date`` is set). The two parts are not
        de-duplicated.

    Raises:
        ValueError: if a date string does not match ``YYYY-MM-DD`` or the
            timestamp is not numeric.
    """
    date_format = '%Y-%m-%d'
    dates = []
    if timestamp:
        # Named ``day`` so that the ``date`` class imported at module level
        # is not shadowed.
        day = datetime.fromtimestamp(int(timestamp[:10]))
        dates.append(day.strftime(date_format))
    if from_date and not to_date:
        to_date = datetime.now().strftime(date_format)
    if from_date and to_date:
        current = datetime.strptime(from_date, date_format)
        # Parsed once instead of on every loop iteration.
        last = datetime.strptime(to_date, date_format)
        one_day = timedelta(days=1)
        while current <= last:
            dates.append(current.strftime(date_format))
            current += one_day
    return dates


def parse_redir(row):
    """Mapper: extract a voice-input "web" action from one log row.

    The row must provide the columns ``key``, ``subkey`` and ``value``.
    ``value`` is a tab-separated list of ``name=value`` records. Rows are
    processed only when ``value`` matches both ``=1_9_<digit>`` and
    ``voice.voice_in.``.

    Record handling:
      * ``url=``       -> ``url`` is stored as ``None``;
      * ``vars=...``   -> comma-separated ``name=value`` items, each stored;
      * ``path...``    -> the text after the last ``.p0.`` / ``.p1.`` marker
                          becomes ``action`` (``.p1.`` wins if both occur);
      * anything else  -> stored as ``name=value``.

    Tokens without ``=`` are skipped, and values keep any further ``=``
    characters they contain.

    Yields:
        At most one dict with the keys ``yandexuid``, ``dayuse``, ``ui``,
        ``voiceid``, ``query``, ``user_input``, ``log`` and ``ts`` when the
        parsed ``action`` equals ``"web"``. Fields that are absent from the
        row are emitted as ``None``. ``yandexuid`` is ``"y"`` plus the part
        of ``key`` after the slash, or ``None`` if ``key`` does not consist
        of exactly two ``/``-separated parts.
    """
    version_re = re.compile(r'=1_9_\d')
    voice_re = re.compile(r'voice\.voice_in\.')
    value = row['value']
    if not (version_re.search(value) and voice_re.search(value)):
        return

    key_parts = row['key'].split('/')
    if len(key_parts) == 2:
        yuid = "y" + key_parts[-1]
    else:
        yuid = None

    fields = {}
    vars_prefix = "vars="
    for rec in value.split('\t'):
        if rec == "url=":
            fields["url"] = None
        elif rec.startswith(vars_prefix):
            # Slice the prefix off: str.lstrip("vars=") would strip a *set*
            # of characters and could eat the start of the first item name.
            for item in rec[len(vars_prefix):].split(","):
                param, sep, val = item.partition("=")
                if sep:
                    fields[param] = val
        elif rec.startswith("path"):
            # A path without either marker leaves ``action`` untouched.
            for marker in (".p0.", ".p1."):
                if marker in rec:
                    fields["action"] = rec.split(marker)[-1]
        else:
            param, sep, val = rec.partition("=")
            if sep:
                fields[param] = val

    if fields.get("action") == "web":
        yield {
            'yandexuid': yuid,
            'dayuse': fields.get("-dayuse"),
            'ui': fields.get("-ui"),
            'voiceid': fields.get("-voiceid"),
            'query': fields.get("text"),
            'user_input': fields.get("user_input"),
            'log': fields.get("log"),
            'ts': row['subkey'],
        }

def parse_sessions(key, recs):
    """Reducer: emit one row per voice-originated web request of a session.

    ``recs`` is handed to ``libra.ParseSession`` together with the local
    ``./blockstat.dict`` file. A request is kept when it is a
    ``TYandexWebRequest``, its query is at most 250 long and its referer
    contains ``query_source=voice``.

    Args:
        key: reduce key; ``key["key"]`` is written to the ``yandexuid`` column.
        recs: the records of one session group.

    Yields:
        Dicts with ``query``, ``timestamp``, ``yandexuid``, ``referers``,
        ``clicks`` (number of collected clicks), ``clicks_params`` and
        ``voice``. ``clicks_params`` maps the 1-based click number (as a
        string, counted across all main blocks of the request) to
        ``[path, dwell time, delay after request, url]``, all as strings.

    Any exception raised while parsing or iterating the session ends the
    generator for this key; the error is reported on stderr.
    """
    # Always None: join_logs only checks for the *presence* of the "voice"
    # key to recognise rows produced by this reducer.
    voice = None
    try:
        session = libra.ParseSession(recs, './blockstat.dict')
        for request in session:
            if not request.IsA("TYandexWebRequest"):
                continue
            referer = request.Referer
            req_ts = request.Timestamp
            if len(request.Query) > 250:
                continue
            if referer.find("query_source=voice") == -1:
                continue
            # Fresh per request, so clicks of earlier requests do not leak
            # into later rows; the counter runs across blocks, so clicks of
            # different blocks do not overwrite each other.
            clicks_params = {}
            click_no = 0
            for block in request.GetMainBlocks():
                for click in block.GetClicks():
                    click_no += 1
                    clicks_params[str(click_no)] = [
                        str(click.ConvertedPath),
                        str(click.DwellTime),
                        str(click.DelayAfterRequest),
                        str(click.Url),
                    ]
            yield {
                "query": request.Query,
                "timestamp": req_ts,
                "yandexuid": key["key"],
                "referers": referer,
                "clicks": len(clicks_params),
                "clicks_params": clicks_params,
                "voice": voice,
            }
    except Exception as exc:
        # Best effort: a broken session is dropped, but no longer silently.
        sys.stderr.write("parse_sessions: session skipped: %r\n" % (exc,))
        return

def join_logs(key, recs):
    """Reducer: merge all rows of one (yandexuid, query) group into one row.

    Rows containing a ``voice`` key contribute ``clicks``, ``clicks_params``
    and ``timestamp`` (stored as ``ts``). All other rows contribute
    ``dayuse``, ``ui``, ``user_input``, ``log``, ``action`` and ``ts``.
    For every field the last row that provides it wins; a field missing from
    a row keeps its previous value and does not affect the other fields.

    Args:
        key: reduce key (not used; ids are taken from the rows themselves).
        recs: iterable of row dicts of the group.

    Yields:
        A single merged dict, or nothing when no row carried both
        ``yandexuid`` and ``query`` (including an empty ``recs``).
    """
    uid = None
    query = None
    clicks = None
    clicks_params = None
    ts = None
    dayuse = None
    ui = None
    user_input = None
    log = None
    action = None
    for rec in recs:
        try:
            uid = rec["yandexuid"]
            query = rec["query"]
        except KeyError:
            continue
        if "voice" in rec:
            clicks = rec.get("clicks", clicks)
            clicks_params = rec.get("clicks_params", clicks_params)
            ts = rec.get("timestamp", ts)
        else:
            dayuse = rec.get("dayuse", dayuse)
            ui = rec.get("ui", ui)
            user_input = rec.get("user_input", user_input)
            log = rec.get("log", log)
            # Optional lookups: a row without "action" must still deliver
            # its "ts".
            action = rec.get("action", action)
            ts = rec.get("ts", ts)
    # Disabled stricter variant: if clicks is not None and dayuse is not None:
    if uid is not None:
        yield {
            "yandexuid": uid,
            "query": query,
            "clicks": clicks,
            "clicks_params": clicks_params,
            "ts": ts,
            "dayuse": dayuse,
            "ui": ui,
            "user_input": user_input,
            "log": log,
            "action": action,
        }


def check_key(row):
    """Mapper: pass a row through only if its ``query`` has length <= 200."""
    too_long = len(row["query"]) > 200
    if too_long:
        return
    yield row

def calculations(date):
    """Run the YT steps that build the voice-only web sessions table for a day.

    Active steps, in order:
      1. filter the ``us_<date>`` temporary table in place with ``check_key``;
      2. sort it by (yandexuid, query);
      3. reduce it together with the ``redir_<date>`` temporary table through
         ``join_logs`` into the output table;
      4. sort the output table by (yandexuid, ts).

    The steps that would (re)build the two temporary tables are commented
    out, so both tables have to exist already.

    Args:
        date: day string that is appended to the table paths.
    """
    tmp_dir = '//home/suggest-dev/galamaj/tmp/'
    redir_source = '//home/desktop/searchband/sessions/' + date
    # Only referenced by the disabled libra reduce step below.
    sessions_source = '//user_sessions/pub/search/daily/' + date + '/clean'
    result_table = '//home/suggest-dev/galamaj/yastroka/web_sessions/only_voice/' + date
    redir_tmp = tmp_dir + 'redir_' + date
    sessions_tmp = tmp_dir + 'us_' + date
    reduce_tmp = tmp_dir + 'reduce_' + date  # not used by any step

    join_keys = ('yandexuid', 'query')
    progress = "%s   %s -> %s"

    yt.wrapper.config['memory_limit'] = 100 * 1024 * 1024 * 1024
    yt.wrapper.create(
        "table",
        path=result_table,
        recursive=True,
        ignore_existing=True,
        attributes=None,
    )
    result_path = yt.wrapper.TablePath(result_table)  # , append = True
    # spec = {"job_io": {"table_writer": {"max_key_weight": 21275L}}}
    # print(progress % (str(datetime.now()), redir_source, result_table))
    # Disabled: parse the Ya.Stroka logs.
    # yt.wrapper.run_map(parse_redir, redir_source, redir_tmp)
    print(progress % (str(datetime.now()), redir_source, redir_tmp))
    # yt.wrapper.run_sort(redir_tmp, sort_by=['yandexuid', 'query'])
    # Disabled: parse the sessions with libra and extract the clicks on the
    # search results.
    # yt.wrapper.run_reduce(parse_sessions, sessions_source, sessions_tmp, reduce_by='key', files=["./blockstat.dict"])
    yt.wrapper.run_map(check_key, sessions_tmp, sessions_tmp)
    yt.wrapper.run_sort(sessions_tmp, sort_by=list(join_keys))
    # print(progress % (str(datetime.now()), sessions_tmp, sessions_tmp))
    yt.wrapper.run_reduce(
        join_logs,
        [redir_tmp, sessions_tmp],
        result_path,
        reduce_by=list(join_keys),
    )  # , spec=spec)
    yt.wrapper.run_sort(result_table, sort_by=['yandexuid', 'ts'])
    print(progress % (str(datetime.now()), [redir_tmp, sessions_tmp], result_path))





if __name__ == '__main__':
    # Entry point: resolve the requested days, print them, then process
    # each day in order.
    options = parse_args()
    days = get_dates(options.timestamp, options.from_date, options.to_date)
    print(days)
    for day in days:
        calculations(day)
