import yt.wrapper

import threading
import sys
import urlparse
import re
import urllib
import urllib2
import math
import time
import json
import time
import datetime
from datetime import date, datetime, timedelta
import argparse

def mapper(row):
    """Map step: turn one suggest-log row into a per-query timing record.

    A row is kept only when all of the following hold:

    * ``service`` is set and contains ``"video_mob_"``;
    * ``country_by_ip`` is one of the supported region ids;
    * ``since_first_change`` lies in (0, 600000) and ``query_length`` in
      (0, 121);
    * ``text`` is present, is not ``''`` or ``' '`` and differs from
      ``prev_query``;
    * ``user_actions_count`` is a plain, non-zero ``int``.

    Yields at most one dict with the keys ``ip``, ``avg_time_per_symbol``,
    ``avg_time_per_action``, ``time``, ``path``, ``query_length``, ``text``,
    ``ratio`` and ``action_type``.  Rows that would require a division by
    zero (no counted symbols or no user actions) are skipped instead of
    failing the whole job.
    """
    service = row["service"]
    if service is None or "video_mob_" not in service:
        return
    ip = row["country_by_ip"]
    if ip not in (225, 187, 149, 159, 983, 171, 170, 209, 168, 167, 169, 208, 207):
        return

    raw_time = row["since_first_change"]
    query_length = row["query_length"]
    text = row["text"]
    actions = row["user_actions_count"]

    # A missing value can never satisfy the checks below; testing for it
    # explicitly avoids None-vs-number comparisons and None.endswith().
    if raw_time is None or query_length is None or text is None:
        return
    if not (0 < raw_time < 600000 and 0 < query_length < 121):
        return
    if text == row["prev_query"] or text in ('', ' '):
        return
    # Only genuine ints are accepted (bool is an int subclass, so exclude it).
    if isinstance(actions, bool) or not isinstance(actions, int):
        return

    # A trailing space is not counted as a typed symbol.
    len_query = query_length - 1 if text.endswith(' ') else query_length
    if len_query == 0 or actions == 0:
        # Both values are used as divisors below.
        return

    input_time = float(raw_time)
    yield {'ip': ip,
           'avg_time_per_symbol': input_time / len_query,
           'avg_time_per_action': input_time / actions,
           'time': input_time,
           'path': row["path"],
           'query_length': len_query,
           'text': row["text"],
           'ratio': row["ratio"],
           'action_type': row["action_type"]}

def saved_metrics(ratios):
    """Aggregate dot-separated ``ratio`` strings into averaged "saved" metrics.

    NOTE: this module defines ``saved_metrics`` a second time further down;
    at import time the later definition replaces this one.

    Each item is expected to look like ``"a.b. ... .c"`` with at least three
    numeric fields.  The second field is reported as the query length, the
    last field as the number of actions, and ``second - first`` as the number
    of saved symbols (presumably the first field is the count of typed
    symbols -- confirm against the log producer).

    Items that are not strings, contain a non-numeric field, have fewer than
    three fields or have a zero second field are skipped.

    Returns a list of four ``"name<TAB>value"`` strings
    (``average_query_length``, ``saved_%``, ``saved_symbols``,
    ``average_actions``), or ``None`` when no item is usable.
    """
    saved_percents = []
    saved_symbols = []
    actions = []
    lengths = []
    for item in ratios:
        try:
            fields = [float(part) for part in item.split(".")]
        except (AttributeError, ValueError):
            # Not a string (e.g. None) or not numeric: skip this item rather
            # than reusing the values parsed from the previous one.
            continue
        if len(fields) < 3 or fields[1] == 0:
            continue
        first, second = fields[0], fields[1]
        saved_percents.append((second - first) / second * 100)
        saved_symbols.append(second - first)
        actions.append(fields[-1])
        lengths.append(second)

    if not lengths:
        return None

    count = float(len(lengths))
    average_length = sum(lengths) / count
    saved_mid = sum(saved_percents) / len(saved_percents)
    saved_symb = sum(saved_symbols) / len(saved_symbols)
    average_actions = sum(actions) / count
    return ['average_query_length' + '\t' + str(average_length),
            'saved_%' + '\t' + str(saved_mid),
            'saved_symbols' + '\t' + str(saved_symb),
            'average_actions' + '\t' + str(average_actions)]


def time_metrics(times):
    """Summarise input times into an average, a trimmed mean and histogram shares.

    ``times`` is any iterable of values convertible to ``float``.  The bucket
    labels (``5000`` <-> "5 seconds") imply the values are milliseconds.

    Returns ``None`` for empty input, otherwise a list of 13
    ``"name<TAB>value"`` strings:

    * ``input_time_average`` -- mean of the strictly positive values;
    * ``average_quantile_95%`` -- mean of the smallest ``int(0.95 * n)``
      values (``0.0`` when that count is zero);
    * six range shares in percent of all values: (0, 5000], (5000, 10000],
      (10000, 15000], (15000, 20000], (20000, 25000], (25000, +inf);
    * five cumulative shares ``0_5`` ... ``0_25``.

    Non-positive values are left out of the average and the buckets but still
    count towards the percentage denominator.
    """
    ordered = sorted(float(value) for value in times)
    total = len(ordered)
    if total == 0:
        return None

    mean = 0
    positive = 0
    buckets = [0] * 6
    for duration in ordered:
        if duration <= 0:
            continue
        positive += 1
        # Running mean: avoids summing all values into one large number.
        mean += (duration - mean) / positive
        if duration <= 5000:
            buckets[0] += 1
        elif duration <= 10000:
            buckets[1] += 1
        elif duration <= 15000:
            buckets[2] += 1
        elif duration <= 20000:
            buckets[3] += 1
        elif duration <= 25000:
            # Upper bound is inclusive so that exactly 25000 is not lost
            # between this bucket and the "more than 25" one.
            buckets[4] += 1
        else:
            buckets[5] += 1

    # Mean of the lowest 95% of the values: divide by the number of values
    # actually summed, not by the fractional 0.95 * n.
    head = int(9.5 * total / 10)
    quantile = sum(ordered[:head]) / head if head else 0.0

    shares = [count * 100.0 / total for count in buckets]

    result = ['input_time_average' + '\t' + str(mean),
              'average_quantile_95%' + '\t' + str(quantile)]
    range_labels = ('input_time_less_5_seconds_%',
                    'input_time_5_10_seconds_%',
                    'input_time_10_15_seconds_%',
                    'input_time_15_20_seconds_%',
                    'input_time_20_25_seconds_%',
                    'input_time_more_25_seconds_%')
    for label, share in zip(range_labels, shares):
        result.append(label + '\t' + str(share))

    cumulative_labels = ('0_5_seconds_%', '0_10_seconds_%', '0_15_seconds_%',
                         '0_20_seconds_%', '0_25_seconds_%')
    cumulative = 0.0
    for label, share in zip(cumulative_labels, shares):
        cumulative += share
        result.append(label + '\t' + str(cumulative))
    return result

def used_metrics(path):
    """Compute the share of shown suggestions that were actually used.

    ``path`` is a sized collection of action-type strings.  Entries equal to
    ``'not_shown'`` are excluded from the denominator; entries equal to
    ``'not_used'`` are excluded from the numerator as well.

    Returns a one-element list ``['used_%<TAB>value']`` or ``None`` when
    nothing was shown (the denominator would be zero).
    """
    total = float(len(path))
    skipped = sum(1 for action in path if action == 'not_used')
    hidden = sum(1 for action in path if action == 'not_shown')

    # Guard clause instead of catching ZeroDivisionError.
    if total - hidden == 0:
        return None

    used = (total - skipped - hidden) / (total - hidden) * 100.0
    return ['used_%' + '\t' + str(used)]

def saved_metrics(ratios):
    """Aggregate dot-separated ``ratio`` strings into averaged "saved" metrics.

    NOTE: an earlier definition with the same name exists above in this
    module; this later one is the definition that is in effect.

    Each item is expected to look like ``"a.b. ... .c"`` with at least three
    numeric fields.  The second field is reported as the query length, the
    last field as the number of actions, and ``second - first`` as the number
    of saved symbols (presumably the first field is the count of typed
    symbols -- confirm against the log producer).

    Items that are not strings, contain a non-numeric field, have fewer than
    three fields or have a zero second field are skipped.

    Returns a list of four ``"name<TAB>value"`` strings
    (``average_query_length``, ``saved_%``, ``saved_symbols``,
    ``average_actions``), or ``None`` when no item is usable.
    """
    saved_percents = []
    saved_symbols = []
    actions = []
    lengths = []
    for item in ratios:
        try:
            fields = [float(part) for part in item.split(".")]
        except (AttributeError, ValueError):
            # Not a string (e.g. None) or not numeric: skip this item rather
            # than reusing the values parsed from the previous one.
            continue
        if len(fields) < 3 or fields[1] == 0:
            continue
        first, second = fields[0], fields[1]
        saved_percents.append((second - first) / second * 100)
        saved_symbols.append(second - first)
        actions.append(fields[-1])
        lengths.append(second)

    if not lengths:
        return None

    count = float(len(lengths))
    average_length = sum(lengths) / count
    saved_mid = sum(saved_percents) / len(saved_percents)
    saved_symb = sum(saved_symbols) / len(saved_symbols)
    average_actions = sum(actions) / count
    return ['average_query_length' + '\t' + str(average_length),
            'saved_%' + '\t' + str(saved_mid),
            'saved_symbols' + '\t' + str(saved_symb),
            'average_actions' + '\t' + str(average_actions)]

def average(measures):
    """Return the arithmetic mean of ``measures`` as a float.

    The mean is updated incrementally, one value at a time, so no running
    total of all values is kept.  An empty iterable yields ``0.0``.
    """
    mean = 0.
    for position, value in enumerate(measures, 1):
        # Move the current mean towards the new value by 1/position.
        mean += (value - mean) / position
    return mean

def count_metrics(key, records):
    """Reduce step: aggregate all mapper records of one region into one row.

    ``key`` is the reduce key passed by the framework (unused here);
    ``records`` iterates over the dicts produced by ``mapper`` for one ``ip``.

    Yields a single dict with the region id, the time / saved / usage metric
    lists produced by ``time_metrics``, ``saved_metrics`` and
    ``used_metrics``, the number of queries, and plain averages of
    time-per-symbol, time-per-action and query length -- both over all
    queries and over "used" queries only (``action_type`` is neither
    ``"not_shown"`` nor ``"not_used"``).  Nothing is yielded when ``records``
    is empty.
    """
    times = []
    ratios = []
    path = []
    atps = []
    atps_used = []
    atpa = []
    atpa_used = []
    length = []
    length_used = []
    total = 0
    ip = None
    for record in records:
        ip = record["ip"]
        action_type = record["action_type"]
        ratios.append(record["ratio"])
        path.append(action_type)
        atps.append(record['avg_time_per_symbol'])
        atpa.append(record['avg_time_per_action'])
        times.append(record['time'])
        length.append(record['query_length'])
        total += 1
        if action_type not in ("not_shown", "not_used"):
            # Only the *_used lists are extended here; the input time was
            # already recorded above and must not be counted twice.
            atps_used.append(record['avg_time_per_symbol'])
            atpa_used.append(record['avg_time_per_action'])
            length_used.append(record['query_length'])

    if total == 0:
        # No records means no region id to report.
        return

    yield {
        'ip': ip,
        'input_time': time_metrics(times),
        'saved': saved_metrics(ratios),
        'ctr': used_metrics(path),
        'total_queries': total,
        'avrg_time_per_symb_by_query': average(atps),
        'avrg_time_per_act_by_query': average(atpa),
        'used_avrg_time_per_symb_by_query': average(atps_used),
        'used_avrg_time_per_act_by_query': average(atpa_used),
        'average_query_length': average(length),
        'used_average_query_length': average(length_used)}


def parse_args():
    """Parse the command line and return the resulting namespace.

    Recognised options (all optional, all strings): ``--timestamp``,
    ``--from_date`` and ``--to_date``.
    """
    cli = argparse.ArgumentParser(description='Suggest metrics calc', add_help=True)
    options = (
        ('--timestamp', 'date timestamp for calculation'),
        ('--from_date', 'from date for calculation (format: YYYY-MM-DD)'),
        ('--to_date', 'to date for calculation (format: YYYY-MM-DD)'),
    )
    for flag, description in options:
        cli.add_argument(flag, help=description)
    # Currently disabled options:
    # parser.add_argument('--mr_server', default='hahn', help='MR server (hahn, banach, ...)')
    # parser.add_argument('--yt_pool', default='robot-suggestor-dev', help='YT pool')
    return cli.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Build the list of ``YYYY-MM-DD`` days to process.

    * ``timestamp`` -- if truthy, its first 10 characters are read as a Unix
      time in seconds and the corresponding local-time day is added.
    * ``from_date`` / ``to_date`` -- if ``from_date`` is given, every day from
      it up to and including ``to_date`` is added; a missing ``to_date``
      defaults to today.  ``to_date`` without ``from_date`` is ignored.

    Returns the days as strings, the timestamp day (if any) first.
    """
    day_format = '%Y-%m-%d'
    result = []

    if timestamp:
        moment = datetime.fromtimestamp(int(timestamp[:10]))
        result.append(moment.strftime(day_format))

    if from_date:
        if not to_date:
            to_date = datetime.now().strftime(day_format)
        first_day = datetime.strptime(from_date, day_format)
        last_day = datetime.strptime(to_date, day_format)
        # Both bounds are midnights, so their difference is a whole number of
        # days; a negative span produces an empty range.
        for offset in range((last_day - first_day).days + 1):
            result.append((first_day + timedelta(offset)).strftime(day_format))

    return result

# Region id (the "ip" field of the metric rows) -> short country tag that is
# prepended to metric names when they are pushed.  Entries are ordered by id.
IP_TO_COUNTRY_NAME_MAP = {
    149: "By",
    159: "Kz",
    167: "Az",
    168: "Am",
    169: "Ge",
    170: "Tm",
    171: "Uz",
    187: "Ua",
    207: "Kg",
    208: "Md",
    209: "Tj",
    225: "Ru",
    983: "Tr",
}


def get_country_name(ip):
    """Return the country tag for region id ``ip``, or ``None`` if unknown."""
    if ip in IP_TO_COUNTRY_NAME_MAP:
        return IP_TO_COUNTRY_NAME_MAP[ip]
    return None

def calc(date):
    """Run the metrics map-reduce for one day and return the output table path.

    ``date`` is the day suffix (a string) appended to both the input and the
    output table paths.  The output table is created if it does not exist,
    then ``mapper`` / ``count_metrics`` are run over the input table, reducing
    by ``ip``.
    """
    source_table = '//home/suggest-dev/suggest_logs/redir-log_preparates/clean/' + date
    result_table = '//home/suggest-dev/suggest/galamaj/suggest_metrics/time_per_symb/video/' + date

    yt.wrapper.config['memory_limit'] = 100 * 1024 ** 3
    yt.wrapper.create(
        "table",
        path=result_table,
        recursive=True,
        ignore_existing=True,
        attributes=None,
    )
    destination = yt.wrapper.TablePath(result_table)

    # Progress line: when the job started and which tables it connects.
    print("%s   %s -> %s" % (str(datetime.now()), source_table, result_table))
    yt.wrapper.run_map_reduce(
        mapper,
        count_metrics,
        source_table,
        destination,
        reduce_by='ip',
        format=yt.wrapper.JsonFormat(),
    )
    return result_table

def _push_point(url, label, retry_delay=300, attempts=2):
    """Send one request to ``url``; on failure wait and retry, then give up.

    Delivery is best-effort: after ``attempts`` failed tries the error is
    reported on stdout and the function returns ``False`` instead of raising.
    Returns ``True`` once the request succeeded.
    """
    for attempt in range(1, attempts + 1):
        try:
            response = urllib2.urlopen(url)
            try:
                print(response.read())
            finally:
                response.close()
            return True
        except Exception as err:
            # Deliberately broad (any transport/HTTP failure) but not bare, so
            # KeyboardInterrupt and SystemExit still stop the script.
            print("failed to push %s (attempt %d of %d): %s" % (label, attempt, attempts, err))
            if attempt < attempts:
                time.sleep(retry_delay)
    return False


def push_to_razladki(outputTable):
    """Read the metrics table and push every metric to the razladki endpoint.

    The day is taken from the last path component of ``outputTable``
    (``YYYY-MM-DD``) and converted to a local-time Unix timestamp.  For each
    row, every non-``None`` column except ``ip`` is sent:

    * a scalar column is sent as ``<country>_<column>``;
    * a list column holds ``"name<TAB>value"`` strings, each of which is sent
      as ``<country>_<name>``.

    Rows whose ``ip`` has no known country tag are skipped.  Metric names and
    values are URL-encoded (several names contain ``%``).  A failed request is
    retried once after a pause and then skipped; it never aborts the run.
    """
    endpoint = 'http://launcher.razladki.yandex-team.ru/save_new_data/' + "video_suggest_metrics_touch"
    date = str(outputTable).split('/')[-1]
    timestamp = time.mktime(time.strptime(date, '%Y-%m-%d'))
    print(timestamp)
    for row in yt.wrapper.read_table(outputTable, raw = False):
        reg = get_country_name(row["ip"])
        if reg is None:
            print("skipping row with unknown region id %s" % (row["ip"],))
            continue
        for key in row.keys():
            value = row[key]
            if key == "ip" or value is None:
                continue
            if isinstance(value, list):
                points = []
                for item in value:
                    parts = item.split("\t")
                    points.append((parts[0], parts[-1]))
            else:
                points = [(key, value)]
            for name, metric in points:
                avgName = reg + "_" + name
                query = urllib.urlencode([(avgName, metric), ('ts', timestamp), ('override', 1)])
                print("%s %s" % (avgName, metric))
                _push_point(endpoint + '?' + query, avgName)

if __name__ == '__main__':
    # Script entry point: resolve the requested days, then for each day
    # compute the metrics table and push its contents.
    cli_args = parse_args()
    days = get_dates(cli_args.timestamp, cli_args.from_date, cli_args.to_date)
    print(days)
    for day in days:
        push_to_razladki(calc(day))

