import yt.wrapper

import threading
import sys
import urlparse
import re
import urllib
import urllib2
import math
import time
import json
import time
import datetime
from datetime import date, datetime, timedelta
import argparse

def mapper(row):
    """Map one input log row to a per-query timing record for market services.

    A row is kept only when all of the following hold:
      * row["service"] is not None and contains the substring "market";
      * row["country_by_ip"] equals one of the whitelisted country ids;
      * 0 <= since_first_change < 600000 and 0 < query_length < 121;
      * row["text"] differs from row["prev_query"] and is not '' or ' '.

    Rows with a None since_first_change, query_length or text are skipped
    (None in text used to crash on .endswith). Rows whose effective query
    length becomes 0 after discounting one trailing space are skipped as
    well; they used to raise ZeroDivisionError.

    Yields at most one dict with the keys 'ip', 'service',
    'avg_time_per_symbol_sfc', 'avg_time_per_symbol_tit', 'time_sfc',
    'time_tit', 'path', 'query_length', 'text', 'ratio' and 'action_type'.
    """
    service = row["service"]
    if service is None or "market" not in service:
        return

    ip = row["country_by_ip"]
    # Same ids that get_country_name() in this file knows how to name.
    if ip not in (225, 187, 149, 159, 983, 171, 170, 209, 168, 167, 169, 208, 207):
        return

    since_first_change = row["since_first_change"]
    query_length = row["query_length"]
    text = row["text"]
    if since_first_change is None or query_length is None or text is None:
        return
    if not (0 <= since_first_change < 600000 and 0 < query_length < 121):
        return
    if text == row["prev_query"] or text in ('', ' '):
        return

    input_time_sfc = float(since_first_change)
    total_input_time = row["total_input_time"]
    # A missing total input time is reported as 0.
    input_time_tit = 0. if total_input_time is None else float(total_input_time)

    # One trailing space is not counted as a typed symbol.
    len_query = query_length - 1 if text.endswith(' ') else query_length
    if len_query <= 0:
        # Nothing left to divide by (e.g. query_length == 1 with a trailing space).
        return

    yield {'ip' : ip,
           'service' : service,
           'avg_time_per_symbol_sfc' : input_time_sfc / len_query,
           'avg_time_per_symbol_tit' : input_time_tit / len_query,
           'time_sfc' : input_time_sfc,
           'time_tit' : input_time_tit,
           'path' : row["path"],
           'query_length' : len_query,
           'text' : text,
           'ratio' : row["ratio"],
           'action_type' : row["action_type"]}




def saved_metrics(ratios):
    """Aggregate "saved typing" statistics from dot-separated ratio strings.

    NOTE: a second definition with the same name appears later in this file
    and, being later, is the one bound at import time.

    Each item is split on "." and every field converted to float. With the
    fields called n, an item contributes:
      * (n[1] - n[0]) / n[1] * 100 to 'saved_%';
      * n[1] - n[0] to 'saved_symbols';
      * n[1] to 'average_query_length';
      * n[-1] to 'average_actions'.
    Items that are None, fail to parse, have fewer than three fields or have
    n[1] == 0 are skipped. (Previously a parse failure silently re-used the
    fields of the preceding item, or raised TypeError on the first item.)

    Returns a list of four "name<TAB>value" strings, or None when no item
    was usable.
    """
    percents = []
    symbols = []
    actions = []
    lengths = []
    for item in ratios:
        try:
            # List comprehension (not map) so len() works on Python 3 as well.
            fields = [float(part) for part in item.split(".")]
        except (ValueError, AttributeError):
            # Unparseable or non-string ratio: ignore this item only.
            continue
        if len(fields) < 3 or fields[1] == 0:
            continue
        percents.append((fields[1] - fields[0]) / fields[1] * 100)
        symbols.append(fields[1] - fields[0])
        actions.append(fields[-1])
        lengths.append(fields[1])

    if not percents:
        return None

    count = float(len(percents))
    return ['average_query_length' + '\t' + str(sum(lengths) / count),
            'saved_%' + '\t' + str(sum(percents) / count),
            'saved_symbols' + '\t' + str(sum(symbols) / count),
            'average_actions' + '\t' + str(sum(actions) / count)]


def time_metrics(times):
    """Summarise a collection of input times as "name<TAB>value" strings.

    The values are converted to float and sorted. Thresholds 5000 ... 25000
    are reported under labels "5 seconds" ... "25 seconds", i.e. the inputs
    are treated as milliseconds.

    Reported metrics, in order:
      * 'input_time_average': mean of the strictly positive values;
      * 'average_quantile_95%': mean of the lowest int(0.95 * N) values of
        the sorted input (0.0 when that count is 0). The sum is now divided
        by the number of values actually summed instead of the fractional
        9.5 * N / 10;
      * six bucket shares in percent: (0, 5000], (5000, 10000],
        (10000, 15000], (15000, 20000], (20000, 25000], (25000, inf).
        A value of exactly 25000 now lands in the (20000, 25000] bucket;
        before it was counted in no bucket at all;
      * five cumulative shares '0_5_seconds_%' ... '0_25_seconds_%'.

    NOTE(review): shares are taken relative to ALL values, including the
    non-positive ones that are excluded from the buckets and the mean, so
    they need not add up to 100. This is kept as in the original.

    Returns None for empty input.
    """
    times = sorted(float(value) for value in times)
    if not times:
        return None

    upper_bounds = (5000, 10000, 15000, 20000, 25000)
    bucket_counts = [0] * (len(upper_bounds) + 1)
    mean = 0.
    positive = 0
    for value in times:
        if value <= 0:
            continue
        positive += 1
        # Incremental mean, avoids keeping a large running sum.
        mean += (value - mean) / positive
        for index, bound in enumerate(upper_bounds):
            if value <= bound:
                break
        else:
            index = len(upper_bounds)  # above the last bound
        bucket_counts[index] += 1

    total = len(times)
    kept = int(9.5 * total / 10)
    trimmed_mean = sum(times[:kept]) / kept if kept else 0.
    shares = [count * 100.0 / total for count in bucket_counts]

    share_labels = ('input_time_less_5_seconds_%',
                    'input_time_5_10_seconds_%',
                    'input_time_10_15_seconds_%',
                    'input_time_15_20_seconds_%',
                    'input_time_20_25_seconds_%',
                    'input_time_more_25_seconds_%')
    cumulative_labels = ('0_5_seconds_%',
                         '0_10_seconds_%',
                         '0_15_seconds_%',
                         '0_20_seconds_%',
                         '0_25_seconds_%')

    result = ['input_time_average' + '\t' + str(mean),
              'average_quantile_95%' + '\t' + str(trimmed_mean)]
    for label, share in zip(share_labels, shares):
        result.append(label + '\t' + str(share))
    running = 0.
    # zip() stops after the five bounded buckets; the open-ended one is excluded.
    for label, share in zip(cumulative_labels, shares):
        running += share
        result.append(label + '\t' + str(running))
    return result

def used_metrics(path):
    """Return the share (in percent) of entries that are neither unused nor hidden.

    path is a sized collection of action-type strings. Entries equal to
    'not_shown' are removed from both numerator and denominator; entries
    equal to 'not_used' are removed from the numerator only.

    Returns None when there is nothing left to divide by (empty input or
    every entry is 'not_shown').
    """
    everything = float(len(path))
    unused = sum(1 for action in path if action == 'not_used')
    hidden = sum(1 for action in path if action == 'not_shown')

    shown = everything - hidden
    if shown == 0:
        return None
    return (everything - unused - hidden) / shown * 100.0

def saved_metrics(ratios):
    """Aggregate "saved typing" statistics from dot-separated ratio strings.

    NOTE: this repeats an earlier definition with the same name in this
    file; being the later one, it is the definition bound at import time.

    Each item is split on "." and every field converted to float. With the
    fields called n, an item contributes:
      * (n[1] - n[0]) / n[1] * 100 to 'saved_%';
      * n[1] - n[0] to 'saved_symbols';
      * n[1] to 'average_query_length';
      * n[-1] to 'average_actions'.
    Items that are None, fail to parse, have fewer than three fields or have
    n[1] == 0 are skipped. (Previously a parse failure silently re-used the
    fields of the preceding item, or raised TypeError on the first item.)

    Returns a list of four "name<TAB>value" strings, or None when no item
    was usable.
    """
    percents = []
    symbols = []
    actions = []
    lengths = []
    for item in ratios:
        try:
            # List comprehension (not map) so len() works on Python 3 as well.
            fields = [float(part) for part in item.split(".")]
        except (ValueError, AttributeError):
            # Unparseable or non-string ratio: ignore this item only.
            continue
        if len(fields) < 3 or fields[1] == 0:
            continue
        percents.append((fields[1] - fields[0]) / fields[1] * 100)
        symbols.append(fields[1] - fields[0])
        actions.append(fields[-1])
        lengths.append(fields[1])

    if not percents:
        return None

    count = float(len(percents))
    return ['average_query_length' + '\t' + str(sum(lengths) / count),
            'saved_%' + '\t' + str(sum(percents) / count),
            'saved_symbols' + '\t' + str(sum(symbols) / count),
            'average_actions' + '\t' + str(sum(actions) / count)]

def average(measures):
    """Return the arithmetic mean of measures, computed incrementally.

    The mean is updated one value at a time (mean += (x - mean) / k), so no
    running sum is kept. An empty iterable yields 0.0.
    """
    mean = 0.
    for seen, value in enumerate(measures, 1):
        mean += (value - mean) / seen
    return mean

def count_metrics(key, records):
    """Reduce all mapper records of one group into a single metrics row.

    key is accepted for the reducer signature but not read; 'ip' and
    'service' of the output are taken from the last record seen. records is
    consumed in a single pass.

    For both timing flavours ("sfc" and "tit") the function collects the
    absolute times and the per-symbol times, once over all records and once
    over the "used" records only (action_type is neither "not_shown" nor
    "not_used"), and hands them to time_metrics() / average(). Ratios go to
    saved_metrics(), action types to used_metrics().

    Yields exactly one dict.
    """
    sfc_times, sfc_times_used = [], []
    tit_times, tit_times_used = [], []
    sfc_per_symbol, sfc_per_symbol_used = [], []
    tit_per_symbol, tit_per_symbol_used = [], []
    lengths, lengths_used = [], []
    ratios = []
    action_types = []
    total = 0

    for row in records:
        total += 1
        ip = row["ip"]
        action = row["action_type"]

        ratios.append(row["ratio"])
        action_types.append(action)
        sfc_per_symbol.append(row['avg_time_per_symbol_sfc'])
        sfc_times.append(row['time_sfc'])
        tit_per_symbol.append(row['avg_time_per_symbol_tit'])
        tit_times.append(row['time_tit'])
        lengths.append(row['query_length'])

        if action in ("not_shown", "not_used"):
            continue
        # Suggest was shown and used for this query.
        sfc_per_symbol_used.append(row['avg_time_per_symbol_sfc'])
        tit_per_symbol_used.append(row['avg_time_per_symbol_tit'])
        tit_times_used.append(row['time_tit'])
        sfc_times_used.append(row['time_sfc'])
        lengths_used.append(row['query_length'])

    yield {
        'ip' : ip,
        'service' : row["service"],
        'input_time_sfc' : time_metrics(sfc_times),
        'input_time_tit' : time_metrics(tit_times),
        'used_input_time_sfc' : time_metrics(sfc_times_used),
        'used_input_time_tit' : time_metrics(tit_times_used),
        'saved' : saved_metrics(ratios),
        'ctr' : used_metrics(action_types),
        'total_queries' : total,
        'avrg_time_per_symb_by_query_sfc' : average(sfc_per_symbol),
        'used_avrg_time_per_symb_by_query_sfc' : average(sfc_per_symbol_used),
        'avrg_time_per_symb_by_query_tit' : average(tit_per_symbol),
        'used_avrg_time_per_symb_by_query_tit' : average(tit_per_symbol_used),
        'average_query_length' : average(lengths),
        'used_average_query_length' : average(lengths_used)}


def get_country_name(ip):
    """Translate a numeric country id into its two-letter label.

    Returns None for ids that are not in the table below.
    """
    known_countries = (
        (225, "Ru"),
        (187, "Ua"),
        (149, "By"),
        (159, "Kz"),
        (983, "Tr"),
        (171, "Uz"),
        (170, "Tm"),
        (209, "Tj"),
        (168, "Am"),
        (167, "Az"),
        (169, "Ge"),
        (207, "Kg"),
        (208, "Md"),
    )
    # Linear scan with ==, so the comparison semantics match a plain if-chain.
    for code, label in known_countries:
        if ip == code:
            return label
    return None

def parse_args():
    """Parse the command line and return the argparse namespace.

    Options: --timestamp, --from_date, --to_date (no defaults),
    --mr_server (default 'hahn') and --yt_pool (default
    'robot-suggestor-dev').
    """
    option_specs = (
        ('--timestamp', {'help': 'date timestamp for calculation'}),
        ('--from_date', {'help': 'from date for calculation (format: YYYY-MM-DD)'}),
        ('--to_date', {'help': 'to date for calculation (format: YYYY-MM-DD)'}),
        ('--mr_server', {'default': 'hahn', 'help': 'MR server (hahn, banach, ...)'}),
        ('--yt_pool', {'default': 'robot-suggestor-dev', 'help': 'YT pool'}),
    )
    parser = argparse.ArgumentParser(add_help=True, description='Suggest metrics calc')
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Build the list of 'YYYY-MM-DD' strings the job should process.

    * timestamp (string): its first 10 characters are read as epoch seconds
      and converted with datetime.fromtimestamp (local time); that single
      day is added first.
    * from_date / to_date: every day of the inclusive range is appended.
      A missing to_date defaults to today; to_date without from_date adds
      nothing.
    """
    date_format = '%Y-%m-%d'
    dates = []

    if timestamp:
        moment = datetime.fromtimestamp(int(timestamp[:10]))
        dates.append(moment.strftime(date_format))

    if from_date:
        if not to_date:
            to_date = datetime.now().strftime(date_format)
        day = datetime.strptime(from_date, date_format)
        last_day = datetime.strptime(to_date, date_format)
        while day <= last_day:
            dates.append(day.strftime(date_format))
            day += timedelta(days=1)

    return dates

def calc(date):
    """Run the map / sort / reduce pipeline for one day and return the output table path.

    date is appended to the fixed input and output directory paths. The
    mapper output goes to a temporary table that is sorted and then reduced
    by ('service', 'ip') into the output table. The temporary table is
    managed by the TempTable context manager.
    """
    source_table = '//home/suggest-dev/suggest_logs/redir-log_preparates/clean/' + date
    result_table = '//home/suggest-dev/galamaj/suggest_metrics/market/' + date

    # 100 * 1024^3; presumably bytes (100 GiB) - confirm against the yt.wrapper config docs.
    yt.wrapper.config['memory_limit'] = 100 * 1024 ** 3
    yt.wrapper.create("table", path=result_table, recursive=True,
                      ignore_existing=True, attributes=None)
    # Plain path: append mode is deliberately not requested here.
    result_path = yt.wrapper.TablePath(result_table)
    print("%s   %s -> %s" % (str(datetime.now()), source_table, result_table))

    with yt.wrapper.TempTable('//home/suggest-dev/galamaj/tmp', prefix='market') as scratch:
        yt.wrapper.run_map(mapper, source_table, scratch,
                           format=yt.wrapper.JsonFormat())
        yt.wrapper.run_sort(scratch, sort_by=['service', 'ip'])
        yt.wrapper.run_reduce(count_metrics, scratch, result_path,
                              reduce_by=['service', 'ip'],
                              format=yt.wrapper.JsonFormat())
    return result_table

# Endpoint template: project name, metric name, metric value, timestamp.
_RAZLADKI_URL_TEMPLATE = 'http://launcher.razladki.yandex-team.ru/save_new_data/%s?%s=%s&ts=%s&override=1'
_RAZLADKI_PROJECT = "market_suggest_metrics"

# (prefix, suffix) wrapped around the item name for list-valued columns;
# columns not listed here use the item name unchanged.
_LIST_METRIC_AFFIXES = {
    "used_input_time_sfc": ("used_", "_sfc"),
    "used_input_time_tit": ("used_", "_tit"),
    "input_time_sfc": ("", "_sfc"),
    "input_time_tit": ("", "_tit"),
}


def _send_to_razladki(name, value, timestamp):
    """Push one metric value and echo the name, value and server response."""
    url = _RAZLADKI_URL_TEMPLATE % (_RAZLADKI_PROJECT, name, value, timestamp)
    print("%s %s" % (name, value))
    print(urllib2.urlopen(url).read())


def push_to_razladki(outputTable):
    """Read the metrics table of one day and push every metric over HTTP.

    The day is the last path component of outputTable ('YYYY-MM-DD'); it is
    converted to a local-time epoch timestamp sent with every value.

    For each row the metric names are built as "<service>_<country>_<name>":
      * 'ip', 'service' and None-valued columns are skipped;
      * 'ctr' is sent as "..._used_%" and, being a scalar, additionally as
        "..._ctr" (NOTE(review): the second push is kept from the original
        code; confirm it is wanted);
      * scalar columns are sent under their column name;
      * list columns hold "name<TAB>value" strings, each sent separately,
        with "used_" / "_sfc" / "_tit" affixes for the input-time columns.

    Error handling: any exception while processing a row is printed, the
    function then sleeps 300 seconds and continues with the NEXT row (the
    failed row is not retried). The original handler called an undefined
    sleep() and therefore always died with NameError.
    """
    day = str(outputTable).split('/')[-1]
    timestamp = time.mktime(time.strptime(day, '%Y-%m-%d'))
    print(timestamp)
    for row in yt.wrapper.read_table(outputTable, raw = False):
        try:
            region = get_country_name(row["ip"])
            service = row["service"]
            for key, value in row.items():
                if key in ("ip", "service") or value is None:
                    continue
                # Built lazily so an unknown country fails inside this try block.
                base = service + "_" + region + "_"
                if key == "ctr":
                    _send_to_razladki(base + "used_%", value, timestamp)
                if not isinstance(value, list):
                    _send_to_razladki(base + key, value, timestamp)
                    continue
                before, after = _LIST_METRIC_AFFIXES.get(key, ("", ""))
                for item in value:
                    fields = item.split("\t")
                    _send_to_razladki(base + before + fields[0] + after,
                                      fields[-1], timestamp)
        except Exception as exc:
            # Best effort: report, back off, move on to the next row.
            print("push to razladki failed: %r" % (exc,))
            time.sleep(300)

if __name__ == '__main__':
    # Script entry point: resolve the days to process from the command line,
    # then compute and publish the metrics one day at a time.
    args = parse_args()
    dates = get_dates(args.timestamp, args.from_date, args.to_date)
    print(dates)
    for day in dates:
        push_to_razladki(calc(day))

