# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import argparse
import datetime
import json
from datetime import date, datetime, timedelta
#import pandas as pd
# from scipy.stats import mannwhitneyu as mw

# Predicate stored as the pickling module filter. A module passes only when it
# is truthy, its name does not contain 'numpy', it has a __file__ attribute and
# that file is not a compiled '.so' extension.
# NOTE(review): presumably yt.wrapper uses this to decide which modules are
# shipped with a job -- confirm against the yt.wrapper documentation.
yt.config['pickling']['module_filter'] = lambda module: (
    module
    and 'numpy' not in module.__name__
    and hasattr(module, '__file__')
    and not module.__file__.endswith('.so')
)

yt.config['pickling']['force_using_py_instead_of_pyc'] = True

########################################################################################################################

class ExtractTestIds:
    """Mapper emitting one ``{"uid", "test_id"}`` row per tracked experiment.

    ``test_ids`` is the collection of experiment ids to keep; membership is
    checked with ``in``.
    """

    def __init__(self, test_ids):
        self.test_ids = test_ids

    def __call__(self, rec):
        """Yield a row for every tab-separated id of ``rec["value"]`` we track.

        Records are skipped when the value is None or empty, or when the key
        is None or contains no "y" character.
        """
        user = rec["key"]
        raw = rec["value"]
        if raw is None or raw == "" or user is None or user.find("y") == -1:
            return
        for candidate in raw.split("\t"):
            if candidate in self.test_ids:
                yield {"uid" : user, "test_id" : candidate}

########################################################################################################################

class ExtractTestIdsNano:
    """Mapper emitting ``{"uid", "test_id", "bucket"}`` rows from JSON values.

    ``test_ids`` is the collection of experiment ids to keep; membership is
    checked with ``in``.
    """

    def __init__(self, test_ids):
        self.test_ids = test_ids

    def __call__(self, rec):
        """Yield one row per tracked entry of the record's ``test_infos`` list.

        ``rec["key"]`` must start with "y" and ``rec["value"]`` must be a JSON
        document; entries of its ``"test_infos"`` list are expected to carry
        ``"test_id"`` and ``"bucket"``. Records with a None key or a
        None/empty value are skipped instead of raising, the same way
        ExtractTestIds skips them.
        """
        uid = rec["key"]
        raw = rec["value"]
        if uid is None or not uid.startswith("y") or not raw:
            return
        value = json.loads(raw)
        if "test_infos" not in value:
            return
        for info in value["test_infos"]:
            if info["test_id"] in self.test_ids:
                yield {"uid":uid, "test_id":info["test_id"], "bucket":info["bucket"]}

########################################################################################################################

class MapRedirPreparates:
    """Mapper that normalises redir-log rows of one service into timing rows."""

    # Caps applied to the normalised query length and to both input times.
    MAX_QUERY_LENGTH = 100
    MAX_INPUT_TIME = 60000

    def __init__(self, service):
        self.service = service

    def __call__(self, row):
        """Yield at most one normalised row for ``row``.

        Rows are dropped when the service differs, when the text is None,
        empty or a single space, when the query length is None, or when either
        ``since_first_change`` or ``total_input_time`` is None or not >= 0.

        The emitted ``query_length`` does not count one trailing space and is
        capped at MAX_QUERY_LENGTH; both times are capped at MAX_INPUT_TIME.
        An average time per symbol is produced only for a positive time and a
        positive query length (otherwise it stays None), so a zero-length
        query no longer raises ZeroDivisionError.
        """
        if row["service"] != self.service:
            return

        text = row["text"]
        query_length = row["query_length"]
        since_first_change = row["since_first_change"]
        total_input_time = row["total_input_time"]

        if text is None or text == '' or text == ' ' or query_length is None:
            return
        # Test for None before comparing: the old code relied on Python 2
        # ordering None below every number.
        if since_first_change is None or total_input_time is None:
            return
        if not since_first_change >= 0 or not total_input_time >= 0:
            return

        uid = "y" + row["yandexuid"]
        input_time_sfc = float(since_first_change)
        input_time_tit = float(total_input_time)

        len_query = float(query_length)
        if text.endswith(' '):
            len_query -= 1
        if len_query > self.MAX_QUERY_LENGTH:
            len_query = self.MAX_QUERY_LENGTH

        atps_sfc = None  # avg_time_per_symbol using since_first_change
        atps_tit = None  # avg_time_per_symbol using total_input_time
        if 0 < input_time_sfc:
            if input_time_sfc > self.MAX_INPUT_TIME:
                input_time_sfc = self.MAX_INPUT_TIME
            if len_query > 0:
                atps_sfc = input_time_sfc/len_query
        if 0 < input_time_tit:
            if input_time_tit > self.MAX_INPUT_TIME:
                input_time_tit = self.MAX_INPUT_TIME
            if len_query > 0:
                atps_tit = input_time_tit/len_query

        yield {'uid' : uid,
               'avg_time_per_symbol_sfc' : atps_sfc,
               'avg_time_per_symbol_tit' : atps_tit,
               'since_first_change' : input_time_sfc,
               'total_input_time' : input_time_tit,
               'action_type' : row["action_type"],
               'query_length' : len_query,
               'tpah_log' : row["tpah_log"],
               'text' : text,
               'user_input' : row["user_input"]
               }

########################################################################################################################

def average(measures):
    """Return the incrementally computed mean of the usable items.

    Items that are None or fail ``item >= 0`` are ignored; the result is 0.0
    when no item qualifies.
    """
    mean = 0.
    seen = 0.
    for value in measures:
        if value is None or not value >= 0:
            continue
        seen += 1
        # Running-mean update: shift the mean by the new item's share.
        mean += (value - mean)/seen
    return mean

def median(lst):
    """Return the median of ``lst``, or None when it is empty.

    Odd-length input returns the middle element of the sorted values as is;
    even-length input returns the float mean of the two middle elements.
    The previous version treated every list longer than one element as even,
    so e.g. [1, 2, 3] produced 1.5 instead of 2.

    Values are not filtered here: non-numeric entries (such as None) may raise
    TypeError while sorting or adding, which callers are expected to handle.
    """
    ordered = sorted(lst)
    count = len(ordered)
    if count < 1:
        return None
    middle = count // 2  # floor division keeps the index an int on Python 3
    if count % 2 == 1:
        return ordered[middle]
    return float(ordered[middle - 1] + ordered[middle])/2.0

def _percentage(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    if not whole:
        return 0.
    return part/whole*100.0


def used_metrics(path):
    """Summarise how suggest was used across a sequence of action types.

    ``path`` is a sequence of action-type values; only 'not_used',
    'not_shown', 'mouse' and 'keyboard' are counted.

    Returns a dict with:
      * 'total', 'not_u', 'not_sh', 'm', 'k' -- float counts (all items,
        not_used, not_shown, mouse, keyboard);
      * 'used' -- share of items that are neither not_used nor not_shown
        among the items that are not not_shown, in percent;
      * 'not_shown', 'coverage', 'used_from_total', 'mouse_keyboard',
        'mouse', 'keyboard' -- shares of the total, in percent.

    Every ratio is guarded on its own, so a zero denominator only zeroes that
    ratio. Previously the first ZeroDivisionError (e.g. every item being
    'not_shown') skipped all remaining ratios, reporting 'not_shown' as 0 and
    leaving raw counts under 'mouse'/'keyboard'.
    """
    actions = list(path)
    total = float(len(actions))
    not_u = float(actions.count('not_used'))
    not_sh = float(actions.count('not_shown'))
    mouse = float(actions.count('mouse'))
    keyboard = float(actions.count('keyboard'))

    shown = total - not_sh
    actually_used = total - not_u - not_sh

    return {
        'used': _percentage(actually_used, shown),
        'not_shown' : _percentage(not_sh, total),
        'coverage' : _percentage(shown, total),
        'used_from_total' : _percentage(actually_used, total),
        'mouse_keyboard' : _percentage(mouse + keyboard, total),
        'mouse': _percentage(mouse, total),
        'keyboard' : _percentage(keyboard, total),
        'not_u' : not_u,
        'not_sh' : not_sh,
        'm' : mouse,
        'total' : total,
        'k' : keyboard
    }

########################################################################################################################

def join_uids(key, recs):
    """Reducer joining one uid's experiment row with its query rows.

    ``recs`` mixes two kinds of rows for the same uid: rows that contain
    'test_id' (optionally with 'bucket') and query rows carrying the timing
    columns. Query rows whose tpah_log is "[[submit,p0,0]]" with a zero
    total_input_time are ignored, and so are rows missing any expected column.

    Yields a single row with the uid, the last seen test_id/bucket and one
    list per query column, but only when a test_id was seen and at least one
    query row was collected.

    Fixes over the previous version: an empty group no longer hits an unbound
    ``uid``; a test row without 'bucket' yields bucket None instead of raising
    KeyError; a query row is appended to all lists or to none, so the lists
    stay index-aligned.
    """
    columns = (
        'avg_time_per_symbol_sfc',
        'since_first_change',
        'avg_time_per_symbol_tit',
        'total_input_time',
        'query_length',
        'action_type',
        'tpah_log',
        'text',
        'user_input',
    )
    collected = dict((name, []) for name in columns)
    uid = None
    exprt = None
    bucket = None
    for rec in recs:
        uid = rec["uid"]
        if 'test_id' in rec:
            exprt = rec['test_id']
            bucket = rec["bucket"] if "bucket" in rec else None
            continue
        try:
            if rec["tpah_log"] == "[[submit,p0,0]]" and rec["total_input_time"] == 0:
                continue
            # Read every column before appending anything so that a missing
            # key cannot leave the parallel lists with different lengths.
            values = [rec[name] for name in columns]
        except KeyError:
            continue
        for name, value in zip(columns, values):
            collected[name].append(value)
    if uid is not None and collected['avg_time_per_symbol_sfc'] and exprt is not None:
        joined = {'uid' : uid, 'test_id' : exprt, 'bucket' : bucket}
        joined.update(collected)
        yield joined

########################################################################################################################

def _share_percent(count, total):
    """Return ``count / total * 100``, or 0.0 when ``total`` is zero."""
    if not total:
        return 0.
    return count/total*100


def _tally_times(times):
    """Count zero, negative and None entries of ``times`` as floats.

    None is tested first so that it is never compared with a number.
    """
    zero = 0.
    negative = 0.
    null = 0.
    for item in times:
        if item is None:
            null += 1
        elif item == 0:
            zero += 1
        elif item < 0:
            negative += 1
    return zero, negative, null


def count_metrics(key, recs):
    """Reducer aggregating per-uid rows into one metrics row per key.

    ``key`` provides 'test_id' and, optionally, 'bucket' (None is reported
    when it is absent). Each row of ``recs`` carries list-valued columns
    ('query_length', 'action_type', 'tpah_log', 'avg_time_per_symbol_sfc',
    'avg_time_per_symbol_tit', 'since_first_change', 'total_input_time',
    'user_input'), which are concatenated across rows before aggregation.

    Yields exactly one row with counts, percentages (relative to the number
    of collected action types), averages and medians. Percentages are 0.0
    when no queries were collected instead of raising ZeroDivisionError. A
    median that raises TypeError is reported as 0.
    """
    exprt = key['test_id']
    bucket = key['bucket'] if "bucket" in key else None

    atps_sfc = []
    atps_tit = []
    times_sfc = []
    times_tit = []
    length = []
    tpah_log = []
    user_input = []
    path = []
    total = 0.
    for rec in recs:
        total += 1
        length.extend(rec['query_length'])
        path.extend(rec['action_type'])
        tpah_log.extend(rec['tpah_log'])
        atps_sfc.extend(rec['avg_time_per_symbol_sfc'])
        atps_tit.extend(rec['avg_time_per_symbol_tit'])
        times_sfc.extend(rec['since_first_change'])
        times_tit.extend(rec['total_input_time'])
        user_input.extend(rec['user_input'])

    # How many tpah logs mention each suggest kind (one log may count for
    # several kinds).
    fulltext = 0.
    phrase = 0.
    tpah = 0.
    for item in tpah_log:
        if item is None:
            continue
        if "fulltext" in item:
            fulltext += 1
        if "phrase" in item:
            phrase += 1
        if "tpah" in item:
            tpah += 1

    # User-input lengths, capped at 100; missing inputs are only counted.
    null_ui = 0.
    total_user_input = []
    for item in user_input:
        if item is None:
            null_ui += 1
            continue
        total_user_input.append(min(len(item), 100))

    zero_sfc, negative_sfc, null_sfc = _tally_times(times_sfc)
    zero_tit, negative_tit, null_tit = _tally_times(times_tit)

    used = used_metrics(path)
    total_queries = len(path)

    try:
        median_tit = median(times_tit)
    except TypeError:
        median_tit = 0
    try:
        median_sfc = median(times_sfc)
    except TypeError:
        median_sfc = 0

    yield {'test_id' : exprt,
           'bucket' : bucket,
           'total_uids' : total,
           'used_%' : used["used"],
           'not_shown_%' : used["not_shown"],
           'suggest_coverage_%' : used["coverage"],
           'used_from_total_%' : used["used_from_total"],
           'mouse_keyboard_%' : used["mouse_keyboard"],
           'mouse_%': used["mouse"],
           'keyboard_%' : used["keyboard"],
           'zero_sfc_%' : _share_percent(zero_sfc, total_queries),
           'null_sfc_%' : _share_percent(null_sfc, total_queries),
           'negative_sfc_%' : _share_percent(negative_sfc, total_queries),
           'zero_tit_%' : _share_percent(zero_tit, total_queries),
           'null_tit_%' : _share_percent(null_tit, total_queries),
           'negative_tit_%' : _share_percent(negative_tit, total_queries),
           'avrg_time_per_symb_sfc' : average(atps_sfc),
           'avrg_time_per_symb_tit' : average(atps_tit),
           'median_input_time_sfc' : median_sfc,
           'median_input_time_tit' : median_tit,
           'avrg_normalized_time_sfc' : average(times_sfc),
           'avrg_normalized_time_tit' : average(times_tit),
           'avrg_query_length_normalized' : average(length),
           'total_queries' : total_queries,
           'null_user_input' : null_ui,
           'null_user_input_%' : _share_percent(null_ui, total_queries),
           'tpah_used' : _share_percent(tpah, total_queries),
           'fulltext_used' : _share_percent(fulltext, total_queries),
           'phrase_used' : _share_percent(phrase, total_queries),
           'tpah' : tpah,
           'fulltext' : fulltext,
           'phrase' : phrase,
           'not_used' : used["not_u"],
           'not_shown' : used["not_sh"],
           'mouse' : used["m"],
           'keyboard' : used["k"],
           'avrg_length_user_input' : average(total_user_input)
           }

########################################################################################################################

def parse_args():
    """Build the command-line parser and return the parsed arguments.

    Options are registered from a table, in declaration order, and parsed
    from ``sys.argv`` by argparse.
    """
    cli = argparse.ArgumentParser(add_help=True, description='Suggest metrics calc')
    options = (
        ('--timestamp', dict(help='date timestamp for calculation')),
        ('--from_date', dict(help='from date for calculation (format: YYYY-MM-DD)')),
        ('--to_date', dict(help='to date for calculation (format: YYYY-MM-DD)')),
        ('--mr_server', dict(default='hahn', help='MR server (hahn, banach, ...)')),
        ('--yt_pool', dict(default='robot-suggestor-dev', help='YT pool')),
        ('--service', dict(help='(serp_ru_touch) - for web touch, (serp_ru_desktop) - for web desktop')),
        ('--test_ids', dict(help='Comma separated experiments numbers (format: 12345,67891,35474)')),
        ('--output1', dict(default='output_by_days', help='results by days')),
        ('--output2', dict(default='output_all_days', help='result all days')),
        ('--output_table_buckets', dict(help='result all days by buckets', default=None, required=False)),
        ('--output_table', dict(help='result all days by buckets', default=None, required=False)),
        ('--prefix_tables', dict(help='prefix for interim tables', default='//home/suggest-dev/galamaj/suggest_experiments')),
    )
    for flag, settings in options:
        cli.add_argument(flag, **settings)
    return cli.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Return the list of 'YYYY-MM-DD' strings to process.

    * A truthy ``timestamp`` contributes the local date of its first ten
      characters read as epoch seconds.
    * ``from_date`` without ``to_date`` runs up to today.
    * With neither bound given, the range is yesterday only (this is added
      even when ``timestamp`` already contributed a date).
    * With both bounds set, every day of the inclusive range is appended.
    * ``to_date`` alone contributes nothing.
    """
    fmt = '%Y-%m-%d'
    result = []
    if timestamp:
        result.append(datetime.fromtimestamp(int(timestamp[:10])).strftime(fmt))
    if from_date and not to_date:
        to_date = datetime.now().strftime(fmt)
    if not from_date and not to_date:
        from_date = (datetime.now() - timedelta(days=1)).strftime(fmt)
        to_date = from_date
    if from_date and to_date:
        day = datetime.strptime(from_date, fmt)
        last_day = datetime.strptime(to_date, fmt)
        while day <= last_day:
            result.append(day.strftime(fmt))
            day += timedelta(1)
    return result

########################################################################################################################

def write_result(table, file_name, date):
    """Dump every row of a YT table to ``file_name`` and to stdout.

    For each row read with ``yt.read_table(table)`` the output is a header
    line ``<date>\\t<test_id>`` (emitted twice), one ``<column>\\t<value>``
    line per column, and a blank separator.

    The file is opened with a context manager so it is closed even when
    reading the table or writing fails. Only print/iteration forms valid on
    both Python 2 and Python 3 are used.
    """
    with open(file_name, 'w') as out:
        for row in yt.read_table(table):
            header = date + '\t' + str(row['test_id'])
            # NOTE(review): the header has always been emitted twice; kept
            # as is so the output format does not change -- confirm whether
            # the repetition is intentional.
            for _ in range(2):
                out.write(header + "\n")
                print(header)
            for k, v in row.items():
                line = str(k) + "\t" + str(v)
                print(line)
                out.write(line + "\n")
            print("\n")
            out.write("\n")

########################################################################################################################

if __name__ == '__main__':
    args = parse_args()
    # Validate the mandatory options before using them: previously
    # args.test_ids.split() ran first and the checks only printed a message.
    if args.test_ids is None:
        sys.exit("You need to enter --test_ids parameter")
    if args.service is None:
        sys.exit("Enter srv of your service")

    dates = get_dates(args.timestamp, args.from_date, args.to_date)
    test_ids = args.test_ids.split(",")
    service = args.service
    merge_tables = []
    merge_tables_buckets = []
    print(dates)
    output1 = args.output1
    output2 = args.output2
    TablePref = args.prefix_tables

    yt.config['memory_limit'] = 100 * 1024 * 1024 * 1024  # 100 GiB
    # yt.config['token_path'] = '/home/galamaj/.yt/token_mine'
    # NOTE(review): --mr_server and --yt_pool are parsed but not applied here;
    # the proxy is hard-coded and the pool assignment is commented out.
    yt.config["proxy"]["url"] = "hahn.yt.yandex.net"
    # yt.config["pool"] = 'robot-suggestor-dev'
    yt.config['pickling']['module_filter'] =  lambda module: hasattr(module, '__file__') and "scipy" not in module.__file__
    yt.config['pickling']['force_using_py_instead_of_pyc'] = True
    yt.config['pickling']['dynamic_libraries']['enable_auto_collection'] = True
    yt.config['pickling']['dynamic_libraries']['library_filter'] = lambda lib: not lib.startswith('/lib')

    # Common prefix of all interim and result tables of this experiment set.
    experiment_root = TablePref + '/' + service + '/' + "_".join(test_ids)

    # The loop variable is not called `date` so that the `date` class imported
    # at the top of the file is not overwritten.
    for day in dates:
        print(TablePref)
        nano_sessions = "//user_sessions/pub/nano_sessions/daily/" + day + "/web/clean"
        preparates = "//home/suggest-dev/suggest_logs/redir-log_preparates/clean/" + day
        extracted_uids = '//tmp/suggest_experiments/' + service + '/' + "_".join(test_ids) + '/uids_testids/' + day
        filtered_redir = experiment_root + '/redir/' + day
        merge_table = experiment_root + '/merged/' + day
        result_table = experiment_root + '/result/' + day
        result_buckets = experiment_root + '/result/buckets_' + day

        # Create every output table that does not exist yet. The bucket table
        # used to be guarded by a check on result_table, so it was never
        # created here.
        for table in (filtered_redir, extracted_uids, merge_table, result_table, result_buckets):
            if not yt.exists(table):
                yt.create(type="table", path=table, recursive=True)

        yt.run_map(ExtractTestIdsNano(test_ids), nano_sessions, extracted_uids, spec = {'data_size_per_job': 16000000000})
        yt.run_map(MapRedirPreparates(service), preparates, filtered_redir, spec = {'data_size_per_job': 16000000000})
        yt.run_sort(filtered_redir, sort_by = 'uid')
        yt.run_sort(extracted_uids, sort_by = 'uid')
        yt.run_reduce(join_uids, [filtered_redir, extracted_uids], merge_table, reduce_by=['uid'])
        yt.run_sort(merge_table, sort_by = ['test_id','bucket'])
        yt.run_reduce(count_metrics, merge_table, result_table, sort_by = 'test_id', reduce_by = 'test_id')
        yt.run_reduce(count_metrics, merge_table, result_buckets, sort_by = ['test_id','bucket'], reduce_by = ['test_id','bucket'])
        yt.run_sort(merge_table, sort_by = ['test_id'])
        merge_tables.append(merge_table)
        merge_tables_buckets.append(result_buckets)
        write_result(result_table, output1, day)

    # Each output path falls back to its default only when its own option is
    # missing (the bucket path used to be selected by --output_table).
    if args.output_table_buckets:
        output_table_buckets = args.output_table_buckets
    else:
        output_table_buckets = experiment_root + '/result/buckets_all_days'
    if args.output_table:
        output_table = args.output_table
    else:
        output_table = experiment_root + '/result/all_days'
    if not yt.exists(output_table):
        yt.create(type="table", path=output_table, recursive=True)
    yt.concatenate(source_paths=merge_tables_buckets, destination_path=output_table_buckets)
    yt.run_reduce(count_metrics, merge_tables, output_table, sort_by = ['test_id'], reduce_by = 'test_id')
    write_result(output_table, output2, "/".join(dates))
    print("success")


