import yt.wrapper

import threading
import sys
import urlparse
import re
import urllib
import urllib2
import math
import time
import datetime
from datetime import date, datetime, timedelta
import argparse


def _time_per_symbol(duration, length):
    """Return ``duration / length``, or None when ``length`` is not positive."""
    if length > 0:
        return duration / length
    return None


def mapper(row):
    """Map one input row to at most one record of per-request timing fields.

    A row is accepted only when ``country_by_ip`` is in the allowed list,
    ``query_length`` is strictly between 0 and 101, and ``text`` is neither
    empty nor a single space. Accepted rows yield one dict with:

    * the input times (``since_first_change`` and ``total_input_time`` are
      clamped to 60000 when positive; ``pure_input_time`` is
      ``since_first_change - since_last_change``, emitted both raw and
      clamped);
    * the matching "average time per symbol" values, i.e. the input time
      divided by the effective query length. They are None when the time is
      missing or not positive, or when the effective length is not positive;
    * ``fulltext`` / ``history`` / ``empty_pref_history`` flags (0. or 1.)
      derived from substrings of ``tpah_log``;
    * ``service`` and ``action_type`` copied through unchanged.

    Rows on which an attribute lookup fails (for example ``text`` is None)
    are skipped silently.
    """
    allowed_countries = (225, 187, 149, 159, 983, 171, 170, 209, 168, 167, 169, 208, 207)
    max_input_time = 60000      # positive input times above this are clamped
    max_query_length = 100.0    # float, so per-symbol division is never integer division
    fulltext = 0.
    history = 0.
    empty_pref_history = 0.
    try:
        ip = row["country_by_ip"]
        accepted = ((ip in allowed_countries) and 0 < row["query_length"] < 101
                    and row["text"] != '' and row["text"] != ' ')
        if not accepted:
            return

        # A trailing space is not counted as a typed symbol.
        len_query = float(row["query_length"])
        if row["text"].endswith(' '):
            len_query -= 1
        len_query = min(len_query, max_query_length)

        pure_input_time = None
        pure_input_time_normalized = None
        atps_pure = None
        atps_pure_normalized = None
        input_time_sfc = row["since_first_change"]
        if input_time_sfc is not None and row["since_last_change"] is not None:
            pure_input_time = input_time_sfc - row["since_last_change"]
            if pure_input_time > 0:
                pure_input_time_normalized = min(pure_input_time, max_input_time)
                # len_query can be 0 (query_length == 1 with a trailing space);
                # the helper returns None instead of raising ZeroDivisionError.
                atps_pure = _time_per_symbol(pure_input_time, len_query)
                atps_pure_normalized = _time_per_symbol(pure_input_time_normalized, len_query)

        # avg_time_per_symbol using since_first_change
        atps_sfc = None
        if input_time_sfc is not None and input_time_sfc > 0:
            input_time_sfc = min(input_time_sfc, max_input_time)
            atps_sfc = _time_per_symbol(input_time_sfc, len_query)

        # avg_time_per_symbol using total_input_time
        atps_tit = None
        input_time_tit = row["total_input_time"]
        if input_time_tit is not None and input_time_tit > 0:
            input_time_tit = min(input_time_tit, max_input_time)
            atps_tit = _time_per_symbol(input_time_tit, len_query)

        tpah_log = row["tpah_log"]
        if tpah_log is not None:
            has_history = "history" in tpah_log
            has_phrase = "phrase" in tpah_log
            if "fulltext" in tpah_log or has_phrase or has_history:
                fulltext = 1.
            if has_history:
                history = 1.
            if row["user_input"] is None and (has_history or has_phrase):
                empty_pref_history = 1.

        yield {
            'ip': ip,
            'avg_time_per_symbol_sfc': atps_sfc,
            'avg_time_per_symbol_tit': atps_tit,
            'since_first_change': input_time_sfc,
            'total_input_time': input_time_tit,
            'pure_input_time': pure_input_time,
            'pure_input_time_normalized': pure_input_time_normalized,
            'avg_time_per_symbol_pure': atps_pure,
            'avg_time_per_symbol_pure_normalized': atps_pure_normalized,
            'service': row["service"],
            'action_type': row["action_type"],
            'query_length': len_query,
            'tpah_log': tpah_log,
            'fulltext': fulltext,
            'history': history,
            'empty_pref_history': empty_pref_history,
        }
    except AttributeError:
        # Deliberate best effort: rows with malformed fields (e.g. a None
        # ``text``) are dropped rather than failing the whole job.
        pass


def average(measures):
    """Return the mean of the non-None items of ``measures``.

    The mean is updated incrementally, one item at a time, instead of
    summing first and dividing at the end. Returns 0.0 when there is
    nothing to average.
    """
    mean = 0.
    present = (value for value in measures if value is not None)
    for seen, value in enumerate(present):
        # ``seen`` items have been folded in so far; this is item seen + 1.
        mean += (value - mean) / (seen + 1)
    return mean

def median(lst):
    """Return the median of ``lst``, or None when it is empty.

    For an odd number of items the middle element is returned as is; for an
    even number the result is the float mean of the two middle elements.
    The input is not modified (a sorted copy is used).
    """
    ordered = sorted(lst)
    size = len(ordered)
    if size < 1:
        return None
    # Floor division keeps the index an int on Python 3 as well; plain ``/``
    # only produced an int under Python 2 integer division.
    mid = size // 2
    if size % 2 == 1:
        return ordered[mid]
    return float(ordered[mid - 1] + ordered[mid]) / 2.0

def used_metrics(path):
    """Compute usage percentages from a sequence of ``action_type`` values.

    Items equal to ``'not_shown'``, ``'not_used'``, ``'mouse'`` and
    ``'keyboard'`` are counted; any other value only contributes to the
    total. A request counts as "used" when it is neither ``'not_shown'`` nor
    ``'not_used'``.

    Returns a dict of percentages:

    * ``used``            -- used / (total - not_shown)
    * ``not_shown``       -- not_shown / total
    * ``coverage``        -- (total - not_shown) / total
    * ``used_from_total`` -- used / total
    * ``mouse_keyboard``  -- (mouse + keyboard) / total
    * ``mouse``           -- mouse / used
    * ``keyboard``        -- keyboard / used

    Returns None when any of these denominators is zero, i.e. when ``path``
    is empty, everything is ``'not_shown'``, or nothing was used.
    """
    total = float(len(path))
    counts = {'not_used': 0., 'not_shown': 0., 'mouse': 0., 'keyboard': 0.}
    for item in path:
        if item in counts:
            counts[item] += 1

    shown = total - counts['not_shown']
    used_count = shown - counts['not_used']
    if total == 0 or shown == 0 or used_count == 0:
        return None

    return {
        'used': used_count / shown * 100.0,
        'not_shown': counts['not_shown'] / total * 100.0,
        'coverage': shown / total * 100.0,
        'used_from_total': used_count / total * 100.0,
        'mouse_keyboard': (counts['mouse'] + counts['keyboard']) / total * 100.0,
        # Shares are taken from the *count* of used requests. Dividing by the
        # ``used`` percentage instead gives a value that scales with the
        # group size and is not a percentage.
        'mouse': counts['mouse'] / used_count * 100.0,
        'keyboard': counts['keyboard'] / used_count * 100.0,
    }


def _is_positive(value):
    """Return True when ``value`` is not None and strictly greater than zero."""
    return value is not None and value > 0


def count_metrics(key, recs):
    """Aggregate the mapper records of one group into a single metrics row.

    ``recs`` is an iterable of dicts as produced by ``mapper``. ``key`` is
    not used; ``service`` and ``ip`` for the output row are taken from the
    records themselves. Yields exactly one dict, or nothing when ``recs`` is
    empty.

    The output contains:

    * usage percentages from ``used_metrics`` (all None when it returns None);
    * percentages of records whose ``since_first_change`` /
      ``total_input_time`` is zero, None or negative;
    * averages and medians of the input times and per-symbol times, over all
      records and over "used" records (``action_type`` neither
      ``"not_shown"`` nor ``"not_used"``);
    * average input times restricted to records flagged ``fulltext`` or
      ``empty_pref_history`` with a positive time;
    * click counters for the ``fulltext`` / ``history`` /
      ``empty_pref_history`` flags and their shares.
    """
    atps_sfc = []
    atps_sfc_used = []
    atps_tit = []
    atps_tit_used = []
    atps_pure = []
    atps_pure_normalized = []
    times_sfc = []
    times_tit = []
    times_sfc_used = []
    times_tit_used = []
    times_sfc_hist = []
    times_tit_hist = []
    times_sfc_fulltext = []
    times_tit_fulltext = []
    times_pure = []
    times_pure_normalized = []
    length = []
    length_used = []
    path = []
    negative_sfc = 0.
    zero_sfc = 0.
    null_sfc = 0.
    negative_tit = 0.
    zero_tit = 0.
    null_tit = 0.
    total = 0.
    fulltext = 0.
    history = 0.
    empty_pref_history = 0.
    empty_pref_history_part_in_fulltext = 0.
    empty_pref_history_part_in_total = 0.
    fulltext_part = 0.
    ip = None
    srv = None

    for rec in recs:
        total += 1
        ip = rec["ip"]
        srv = rec["service"]
        sfc = rec['since_first_change']
        tit = rec['total_input_time']
        action = rec['action_type']

        length.append(rec['query_length'])
        path.append(action)
        fulltext += rec["fulltext"]
        history += rec["history"]
        empty_pref_history += rec["empty_pref_history"]

        # A time only enters the samples when its per-symbol value exists,
        # i.e. when the mapper saw a positive input time.
        if rec['avg_time_per_symbol_sfc'] is not None:
            atps_sfc.append(rec['avg_time_per_symbol_sfc'])
            times_sfc.append(sfc)
        if rec['avg_time_per_symbol_tit'] is not None:
            atps_tit.append(rec['avg_time_per_symbol_tit'])
            times_tit.append(tit)
        if rec['avg_time_per_symbol_pure'] is not None:
            atps_pure.append(rec['avg_time_per_symbol_pure'])
            times_pure.append(rec['pure_input_time'])
        if rec['avg_time_per_symbol_pure_normalized'] is not None:
            atps_pure_normalized.append(rec['avg_time_per_symbol_pure_normalized'])
            times_pure_normalized.append(rec['pure_input_time_normalized'])

        # None is tested first so that no ordering comparison is ever made
        # against None (that only works on Python 2).
        if sfc is None:
            null_sfc += 1
        elif sfc == 0:
            zero_sfc += 1
        elif sfc < 0:
            negative_sfc += 1
        if tit is None:
            null_tit += 1
        elif tit == 0:
            zero_tit += 1
        elif tit < 0:
            negative_tit += 1

        if action != "not_shown" and action != "not_used":
            length_used.append(rec['query_length'])
            if rec['avg_time_per_symbol_sfc'] is not None:
                atps_sfc_used.append(rec['avg_time_per_symbol_sfc'])
                times_sfc_used.append(sfc)
            if rec['avg_time_per_symbol_tit'] is not None:
                atps_tit_used.append(rec['avg_time_per_symbol_tit'])
                times_tit_used.append(tit)

        if rec['empty_pref_history'] == 1:
            if _is_positive(sfc):
                times_sfc_hist.append(sfc)
            if _is_positive(tit):
                times_tit_hist.append(tit)
        if rec["fulltext"] == 1:
            if _is_positive(sfc):
                times_sfc_fulltext.append(sfc)
            if _is_positive(tit):
                times_tit_fulltext.append(tit)

    if total == 0:
        # Nothing to aggregate: avoid dividing by zero below.
        return

    used = used_metrics(path)
    if used is None:
        # Usage percentages are undefined for this group; emit them as None.
        used = dict.fromkeys(('used', 'not_shown', 'coverage', 'used_from_total',
                              'mouse_keyboard', 'mouse', 'keyboard'))

    if fulltext != 0:
        empty_pref_history_part_in_fulltext = empty_pref_history / fulltext * 100
        empty_pref_history_part_in_total = empty_pref_history / total * 100
        fulltext_part = fulltext / total * 100

    yield {
        'service': srv,
        'ip': ip,
        'total': total,
        'used_%': used["used"],
        'not_shown_%': used["not_shown"],
        'suggest_coverage_%': used["coverage"],
        'used_from_total_%': used["used_from_total"],
        'mouse_keyboard': used["mouse_keyboard"],
        'mouse': used["mouse"],
        'keyboard': used["keyboard"],
        'zero_sfc_%': zero_sfc / total * 100,
        'null_sfc_%': null_sfc / total * 100,
        'negative_sfc_%': negative_sfc / total * 100,
        'zero_tit_%': zero_tit / total * 100,
        'null_tit_%': null_tit / total * 100,
        'negative_tit_%': negative_tit / total * 100,
        'avrg_time_per_symb_sfc': average(atps_sfc),
        'avrg_time_per_symb_tit': average(atps_tit),
        'avrg_pure_time_per_symb': average(atps_pure),
        'avrg_pure_time_per_symb_normalized': average(atps_pure_normalized),
        'median_input_time_sfc': median(times_sfc),
        'median_input_time_tit': median(times_tit),
        'avrg_normalized_time_sfc': average(times_sfc),
        'avrg_normalized_time_tit': average(times_tit),
        'avrg_pure_input_time': average(times_pure),
        'avrg_pure_input_time_normalized': average(times_pure_normalized),
        'used_avrg_time_per_symb_sfc': average(atps_sfc_used),
        'used_avrg_time_per_symb_tit': average(atps_tit_used),
        'used_median_input_time_sfc': median(times_sfc_used),
        'used_median_input_time_tit': median(times_tit_used),
        'used_avrg_normalized_time_sfc': average(times_sfc_used),
        'used_avrg_normalized_time_tit': average(times_tit_used),
        'avrg_query_length_normalized': average(length),
        'used_avrg_query_length_normalized': average(length_used),
        'fulltext_avrg_normalized_time_sfc': average(times_sfc_fulltext),
        'fulltext_avrg_normalized_time_tit': average(times_tit_fulltext),
        'history_avrg_normalized_time_sfc': average(times_sfc_hist),
        'history_avrg_normalized_time_tit': average(times_tit_hist),
        'fulltext_clicks': fulltext,
        'history_clicks': history,
        'empty_pref_history_clicks': empty_pref_history,
        'empty_pref_history_part_in_fulltext': empty_pref_history_part_in_fulltext,
        'empty_pref_history_part_in_total': empty_pref_history_part_in_total,
        'fulltext_part': fulltext_part,
    }

def parse_args():
    """Parse the command-line options of the script and return the namespace.

    All options are optional strings; ``--mr_server``, ``--yt_pool`` and
    ``--output`` carry defaults, the date-related ones default to None.
    """
    # (flag, keyword arguments for add_argument), in help-output order.
    options = (
        ('--timestamp', {'help': 'date timestamp for calculation'}),
        ('--from_date', {'help': 'from date for calculation (format: YYYY-MM-DD)'}),
        ('--to_date', {'help': 'to date for calculation (format: YYYY-MM-DD)'}),
        ('--mr_server', {'default': 'hahn', 'help': 'MR server (hahn, banach, ...)'}),
        ('--yt_pool', {'default': 'robot-suggestor-dev', 'help': 'YT pool'}),
        ('--output', {'default': 'output', 'help': 'output date'}),
    )
    parser = argparse.ArgumentParser(add_help=True, description='Suggest metrics calc')
    for flag, settings in options:
        parser.add_argument(flag, **settings)
    return parser.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Return the list of ``YYYY-MM-DD`` dates the calculation should cover.

    * ``timestamp`` (string of digits): its first 10 characters are read as a
      Unix timestamp in seconds and converted to a local-time date.
      Presumably the truncation is there to accept millisecond timestamps --
      confirm with the caller.
    * ``from_date`` / ``to_date`` (``YYYY-MM-DD`` strings): every day of the
      inclusive range is added. A missing ``to_date`` means "up to today".
      A ``to_date`` without a ``from_date`` contributes no range.
    * When none of the three is given, yesterday is used.

    The timestamp date, if any, comes first, followed by the range.

    Raises:
        ValueError: if a date string does not match ``YYYY-MM-DD`` or the
            timestamp prefix is not an integer.
    """
    date_format = '%Y-%m-%d'
    dates = []
    if timestamp:
        stamp_day = datetime.fromtimestamp(int(timestamp[:10]))
        dates.append(stamp_day.strftime(date_format))
    if from_date and not to_date:
        to_date = datetime.now().strftime(date_format)
    # The "yesterday" default applies only when nothing at all was requested;
    # an explicit timestamp must not silently add an extra day.
    if not timestamp and not from_date and not to_date:
        from_date = (datetime.now() - timedelta(days=1)).strftime(date_format)
        to_date = from_date
    if from_date and to_date:
        current_day = datetime.strptime(from_date, date_format)
        last_day = datetime.strptime(to_date, date_format)
        while current_day <= last_day:
            dates.append(current_day.strftime(date_format))
            current_day += timedelta(days=1)
    return dates

def calc(date):
    """Run the map / sort / reduce pipeline for one day and return the output path.

    ``date`` is appended to fixed input and output table prefixes. The output
    table is created if it does not exist yet. ``mapper`` writes into a
    temporary table, which is sorted by ``service`` and ``ip`` and then
    reduced by the same columns with ``count_metrics`` into the output table.

    Side effect: sets ``yt.wrapper.config['memory_limit']`` to
    100 * 1024**3 (presumably bytes -- confirm against the YT client docs).
    """
    source_table = '//home/suggest-dev/suggest_logs/redir-log_preparates/clean/' + date
    target_table = '//home/suggest-dev/galamaj/suggest_metrics/time_per_symb/all_srv/' + date
    group_columns = ['service', 'ip']

    yt.wrapper.config['memory_limit'] = 100 * 1024 * 1024 * 1024
    yt.wrapper.create(
        "table",
        path=target_table,
        recursive=True,
        ignore_existing=True,
        attributes=None,
    )
    target_path = yt.wrapper.TablePath(target_table)

    # Progress line: "<now>   <input> -> <output>".
    print("%s   %s -> %s" % (str(datetime.now()), source_table, target_table))

    with yt.wrapper.TempTable('//home/suggest-dev/galamaj/tmp', prefix='kpi_') as scratch_table:
        yt.wrapper.run_map(
            mapper, source_table, scratch_table,
            format=yt.wrapper.JsonFormat(),
        )
        yt.wrapper.run_sort(scratch_table, sort_by=group_columns)
        yt.wrapper.run_reduce(
            count_metrics, scratch_table, target_path,
            reduce_by=group_columns,
            format=yt.wrapper.JsonFormat(),
        )
    return target_table

if __name__ == '__main__':
    # Entry point: compute metrics for every requested day and record each
    # successfully processed day, one per line, in the --output file.
    args = parse_args()
    # ``with`` guarantees the output file is closed even when calc() raises,
    # so the days written so far are not lost in an unflushed buffer.
    with open(args.output, 'w') as result:
        dates = get_dates(args.timestamp, args.from_date, args.to_date)
        print(dates)
        for day in dates:
            print(day)
            calc(day)
            # Written only after calc() returned, i.e. the day succeeded.
            result.write(day + "\n")
