import yt.wrapper

import threading
import sys
import urlparse
import re
import urllib
import urllib2
import math
import time
import json
import datetime
from datetime import date, datetime, timedelta
import argparse


def time_metrics(times):
    """Summarise a collection of durations.

    Args:
        times: iterable of values accepted by ``float()``.

    Returns:
        ``[average, trimmed]`` for non-empty input, where ``average`` is the
        arithmetic mean of every value and ``trimmed`` is the sum of the
        ``int(0.95 * count)`` smallest values divided by ``0.95 * count``.
        ``None`` when ``times`` is empty.

    Raises:
        ValueError, TypeError: if a value cannot be converted by ``float()``.
    """
    values = sorted(float(value) for value in times)
    if not values:
        return None

    # Incremental mean, kept in this form so results stay bit-for-bit the
    # same as what has been computed so far.
    average = 0.
    for seen, value in enumerate(values):
        average += (value - average) / (seen + 1)

    # NOTE(review): the second value is not a true quantile. The slice keeps
    # a whole number of items while the divisor is the unrounded 95% count,
    # so for a single value the result is 0. Left unchanged to keep the
    # produced numbers comparable -- confirm whether that is intended.
    cutoff = 9.5 * len(values) / 10
    trimmed = sum(values[:int(cutoff)]) / cutoff
    return [average, trimmed]

def saved_metrics(ratios):
    """Average the per-item savings encoded in "."-separated ratio strings.

    Every item is split on "." and each piece converted with ``float()``.
    Field 0 and field 1 are compared and the last field is averaged as an
    action count (presumably: characters typed, full query length and number
    of actions -- confirm against the producer of these strings).

    Items are skipped when a piece is not numeric, when there are fewer than
    three fields, or when field 1 is zero.

    Args:
        ratios: iterable of strings such as ``"2.10.3"``.

    Returns:
        ``[average_length, saved_percent, saved_symbols, average_actions]``
        computed over the accepted items, or ``None`` if none was accepted.
    """
    percents = []
    symbols = []
    actions = []
    lengths = []
    for item in ratios:
        try:
            fields = [float(part) for part in item.split(".")]
        except ValueError:
            # A malformed item must contribute nothing; falling through here
            # would re-use the fields parsed from the previous item.
            continue
        if len(fields) < 3 or fields[1] == 0:
            continue
        first, second = fields[0], fields[1]
        percents.append((second - first) / second * 100)
        symbols.append(second - first)
        actions.append(fields[-1])
        lengths.append(second)

    if not percents:
        return None
    count = float(len(percents))
    return [
        sum(lengths) / count,
        sum(percents) / count,
        sum(symbols) / count,
        sum(actions) / count,
    ]

def mapper(row):
    """Map stage: emit the row only if it passes every filter below.

    A row is kept when its ``path`` contains "_touch.", ``country_by_ip`` is
    225 (presumably a geo id -- confirm), ``text`` is neither empty nor a
    single space, ``since_first_change`` lies strictly between 0 and 600000
    and ``query_length`` lies strictly between 0 and 101.

    For a kept row whose ``text`` ends with a space, ``query_length`` is
    reduced by one before the row is emitted. The row is modified in place.

    Rows that lack a required key, or whose ``path``/``text`` has no string
    methods, are dropped silently.
    """
    try:
        country = row["country_by_ip"]
        length = row["query_length"]
        if row["path"].find("_touch.") == -1:
            return
        if country != 225:
            return
        if row["text"] in ('', ' '):
            return
        elapsed = row["since_first_change"]
        if not (elapsed > 0 and elapsed < 600000):
            return
        current = row["query_length"]
        if not (current > 0 and current < 101):
            return
        # A trailing space is not counted as part of the query.
        row["query_length"] = length - 1 if row["text"].endswith(' ') else length
        yield row
    except (KeyError, AttributeError):
        pass

def used_metrics(path):
    """Return the percentage of shown items that were used.

    Args:
        path: sized iterable of action labels. "not_shown" entries are
            excluded from the base, "not_used" entries count as unused and
            every other label counts as used.

    Returns:
        ``used / shown * 100`` as a float, or ``None`` when nothing was shown
        (including empty input).
    """
    total = float(len(path))
    unused = sum(1 for action in path if action == "not_used")
    hidden = sum(1 for action in path if action == "not_shown")
    try:
        return (total - unused - hidden) / (total - hidden) * 100.0
    except ZeroDivisionError:
        return None

def _time_stats(times):
    """Return ``(average, trimmed)`` from ``time_metrics``, or ``(None, None)`` when it has no result."""
    stats = time_metrics(times)
    if stats is None:
        return None, None
    return stats[0], stats[1]


def _per_symbol(average, length):
    """Divide *average* by *length*; ``None`` when there is no average or the length is zero."""
    if average is None or not length:
        return None
    return average / length


def count_metrics(key, records):
    """Reduce stage: aggregate timing and usage metrics for one group of records.

    Records are split into buckets along two axes:

    * scope -- all records, records whose ``path`` contains "serp_", and
      records whose ``path`` contains "morda_";
    * usage -- all records, "used" (``action_type`` is neither "not_used"
      nor "not_shown") and "not used" (``action_type`` is "not_used").

    For every bucket the output row carries ``<prefix>input_time_average``,
    ``<prefix>input_time_quantile_95`` and ``<prefix>avrg_time_per_symb``
    (the average divided by the query length), where ``<prefix>`` is the
    usage prefix followed by the scope prefix. The row also carries the
    ``used`` percentages per scope and the record counters.

    Args:
        key: the reduce key; not read here.
        records: iterable of mappings with ``query_length``,
            ``since_first_change``, ``ratio``, ``action_type`` and ``path``.

    Yields:
        A single dict of metrics. Timing values are ``None`` for a bucket
        that received no records. Nothing is yielded when no record is
        usable.
    """
    scopes = ('', 'serp_', 'morda_')
    usages = ('', 'used_', 'not_used_')
    times = {usage + scope: [] for usage in usages for scope in scopes}
    actions = {scope: [] for scope in scopes}
    totals = {scope: 0 for scope in scopes}
    used_totals = {scope: 0 for scope in scopes}
    length = None

    for record in records:
        # Read everything first so that a malformed record is skipped as a
        # whole instead of being counted in some buckets and not in others.
        try:
            record_length = float(record["query_length"])
            elapsed = record["since_first_change"]
            action = record["action_type"]
            path = record["path"]
            # NOTE(review): "ratio" is not aggregated here, but records
            # without it were never counted in the totals, so the lookup is
            # kept to leave the counters comparable -- confirm it is needed.
            _unused_ratio = record["ratio"]
            in_scope = {
                '': True,
                'serp_': path.find("serp_") != -1,
                'morda_': path.find("morda_") != -1,
            }
        except (KeyError, AttributeError, TypeError, ValueError):
            continue

        length = record_length
        if action == "not_used":
            usage = 'not_used_'
        elif action != "not_shown":
            usage = 'used_'
        else:
            usage = None

        for scope in scopes:
            if not in_scope[scope]:
                continue
            times[scope].append(elapsed)
            actions[scope].append(action)
            totals[scope] += 1
            if usage is not None:
                times[usage + scope].append(elapsed)
            if usage == 'used_':
                used_totals[scope] += 1

    if length is None:
        # No usable record: there is no query length to report a row for.
        return

    row = {'symbols': int(length)}
    for prefix, bucket in times.items():
        average, trimmed = _time_stats(bucket)
        row[prefix + 'avrg_time_per_symb'] = _per_symbol(average, length)
        row[prefix + 'input_time_quantile_95'] = trimmed
        row[prefix + 'input_time_average'] = average

    row['used'] = used_metrics(actions[''])
    row['serp_used'] = used_metrics(actions['serp_'])
    row['morda_used'] = used_metrics(actions['morda_'])

    row['total'] = totals['']
    row['total_serp'] = totals['serp_']
    row['total_morda'] = totals['morda_']
    row['total_used'] = used_totals['']
    row['total_serp_used'] = used_totals['serp_']
    row['total_morda_used'] = used_totals['morda_']
    yield row


def parse_args():
    """Parse the command-line options that select which days to process.

    Returns:
        ``argparse.Namespace`` with ``timestamp``, ``from_date`` and
        ``to_date`` attributes; each is ``None`` when the option is absent.
    """
    options = (
        ('--timestamp', 'date timestamp for calculation'),
        ('--from_date', 'from date for calculation (format: YYYY-MM-DD)'),
        ('--to_date', 'to date for calculation (format: YYYY-MM-DD)'),
    )
    cli = argparse.ArgumentParser(description='Suggest metrics calc', add_help=True)
    for flag, description in options:
        cli.add_argument(flag, help=description)
    return cli.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Build the list of ``YYYY-MM-DD`` day strings to process.

    Args:
        timestamp: optional string of digits; only its first ten characters
            are read, as seconds since the epoch, and converted to a local
            date (presumably so millisecond timestamps are accepted too --
            confirm).
        from_date: optional first day of a range, ``YYYY-MM-DD``.
        to_date: optional last day of the range, inclusive. Defaults to
            today when ``from_date`` is given; ignored without ``from_date``.

    Returns:
        The day derived from ``timestamp`` (if any) followed by every day of
        the range (if any). The range is empty when ``to_date`` precedes
        ``from_date``.
    """
    day_format = '%Y-%m-%d'
    days = []
    if timestamp:
        moment = datetime.fromtimestamp(int(timestamp[:10]))
        days.append(moment.strftime(day_format))
    if not from_date:
        return days
    if not to_date:
        to_date = datetime.now().strftime(day_format)
    first = datetime.strptime(from_date, day_format)
    last = datetime.strptime(to_date, day_format)
    for offset in range((last - first).days + 1):
        days.append((first + timedelta(offset)).strftime(day_format))
    return days

def calc(date):
    """Run the map-reduce for one day and return the output table path.

    The input and output tables are the fixed base paths below with ``date``
    appended. The output table is created if it does not exist yet, then
    ``mapper`` and ``count_metrics`` are run over the input, reducing by
    ``query_length``.

    Args:
        date: day string appended to both table paths.

    Returns:
        The output table path as a string.
    """
    source = '//home/suggest-dev/suggest_logs/redir-log_preparates/clean/' + date
    target = '//home/suggest-dev/suggest/galamaj/suggest_metrics/group_by_length/' + date

    # Set on the shared wrapper config, so it stays in effect after this call.
    yt.wrapper.config['memory_limit'] = 100 * 1024 * 1024 * 1024
    yt.wrapper.create(
        "table",
        path=target,
        recursive=True,
        ignore_existing=True,
        attributes=None,
    )
    destination = yt.wrapper.TablePath(target)
    print("%s   %s -> %s" % (datetime.now(), source, target))
    yt.wrapper.run_map_reduce(
        mapper,
        count_metrics,
        source,
        destination,
        reduce_by='query_length',
        format=yt.wrapper.JsonFormat(),
    )
    return target


def push_to_razladki(outputTable):
    """Send every metric of an output table to the razladki endpoint.

    The last "/"-separated component of ``outputTable`` is parsed as a
    ``YYYY-MM-DD`` day and converted to a local-time epoch timestamp that is
    sent with each value. For every row whose ``symbols`` is below 150, each
    column is pushed as ``<symbols>_<column>`` with one HTTP request.

    Pushing is best-effort: a failed request is reported on stderr and the
    remaining metrics are still sent.

    Args:
        outputTable: table path ending in the day the metrics belong to.

    Raises:
        ValueError: if the last path component is not a ``YYYY-MM-DD`` date.
    """
    date = str(outputTable).split('/')[-1]
    timestamp = time.mktime(time.strptime(date, '%Y-%m-%d'))
    print(timestamp)
    for row in yt.wrapper.read_table(outputTable, raw=False):
        symb = str(row["symbols"])
        if not (row["symbols"] < 150):
            continue
        for key, value in row.items():
            name = symb + "_" + key
            try:
                razladkiUrl = 'http://launcher.razladki.yandex-team.ru/save_new_data/%s?%s=%s&ts=%s&override=1' % ("suggest_metrics_for_queries_length", name, value, timestamp)
                print("%s %s" % (name, value))
                response = urllib2.urlopen(razladkiUrl)
                try:
                    print(response.read())
                finally:
                    # Release the connection even if reading fails.
                    response.close()
            except Exception as err:
                # Keep going on failure, but leave a trace of what was lost.
                sys.stderr.write("razladki push failed for %s: %s\n" % (name, err))


if __name__ == '__main__':
    # Resolve the requested days, then compute and publish each one in turn.
    cli_args = parse_args()
    days = get_dates(cli_args.timestamp, cli_args.from_date, cli_args.to_date)
    print(days)
    for day in days:
        push_to_razladki(calc(day))


