import yt.wrapper

import threading
import sys
import urlparse
import re
import urllib
import urllib2
import math
import time
import datetime
from datetime import date, datetime, timedelta
import argparse


def mapper(row):
    """Turn one input row into a per-query timing record, or drop it.

    A row is kept only when all of the following hold:

    * ``row["path"]`` contains ``"_touch."``;
    * ``row["country_by_ip"]`` is one of the supported country ids;
    * ``row["since_first_change"]`` is positive;
    * ``row["query_length"]`` is in the range 1..100;
    * ``row["text"]`` is neither empty nor a single space;
    * the effective query length (``query_length`` minus one when the text
      ends with a space) is positive.

    Yields at most one dict with the keys ``ip``, ``avg_time_per_symbol_sfc``,
    ``avg_time_per_symbol_tit``, ``since_first_change``, ``total_input_time``,
    ``service``, ``action_type`` and ``query_length``.
    ``avg_time_per_symbol_tit`` is ``None`` when ``total_input_time`` is not a
    positive int.
    """
    supported_countries = frozenset(
        (225, 187, 149, 159, 983, 171, 170, 209, 168, 167, 169, 208, 207))
    # Upper bound applied to both timings; presumably milliseconds -- TODO confirm.
    max_input_time = 60000

    ip = row["country_by_ip"]
    input_time_sfc = row["since_first_change"]
    input_time_tit = row["total_input_time"]
    try:
        if row["path"].find("_touch.") == -1 or ip not in supported_countries:
            return
        if not (0 < input_time_sfc and 0 < row["query_length"] < 101):
            return
        text = row["text"]
        if text in ('', ' '):
            return

        # A trailing space is not counted as a typed symbol.
        trailing_space = text.endswith(' ')
        len_query = float(row["query_length"]) - (1 if trailing_space else 0)
        if len_query <= 0:
            # Nothing left to divide by (e.g. query_length == 1 with a
            # trailing space); skip the row instead of raising
            # ZeroDivisionError.
            return

        if input_time_sfc > max_input_time:
            input_time_sfc = max_input_time
        # avg_time_per_symbol using since_first_change
        atps_sfc = input_time_sfc / len_query

        # Validate total_input_time *before* clamping: comparing a non-int
        # (None, str, ...) with the cap is either an error or, in Python 2,
        # an arbitrary ordering that would turn garbage into a valid 60000.
        tit_is_valid = (isinstance(input_time_tit, int)
                        and not isinstance(input_time_tit, bool)
                        and input_time_tit > 0)
        if tit_is_valid:
            if input_time_tit > max_input_time:
                input_time_tit = max_input_time
            # avg_time_per_symbol using total_input_time
            atps_tit = input_time_tit / len_query
        else:
            atps_tit = None

        yield {'ip': ip,
               'avg_time_per_symbol_sfc': atps_sfc,
               'avg_time_per_symbol_tit': atps_tit,
               'since_first_change': input_time_sfc,
               'total_input_time': input_time_tit,
               'service': row["service"],
               'action_type': row["action_type"],
               'query_length': len_query}
    except AttributeError:
        # Best effort: rows whose path/text is not a string (e.g. None) have
        # no .find()/.endswith() and are silently dropped.
        pass


def average(measures):
    """Return the arithmetic mean of the non-None items of ``measures``.

    The mean is updated incrementally, one item at a time. ``None`` entries
    are skipped. Returns ``0.0`` when there is nothing to average.
    """
    mean = 0.
    count = 0.
    for value in (m for m in measures if m is not None):
        count += 1
        # Move the running mean towards the new value by 1/count of the gap.
        mean += (value - mean) / count
    return mean

def median(lst):
    """Return the median of ``lst``, or ``None`` when ``lst`` is empty.

    The input is not modified; a sorted copy is used. For an odd number of
    items the middle item itself is returned; for an even number the result
    is the float mean of the two middle items.
    """
    ordered = sorted(lst)
    count = len(ordered)
    if count < 1:
        return None
    # Floor division keeps the index an int regardless of how "/" behaves
    # (true division would produce a float index and raise TypeError).
    middle = count // 2
    if count % 2 == 1:
        return ordered[middle]
    return float(ordered[middle - 1] + ordered[middle]) / 2.0

def _record_segments(rec):
    """Return the output-key prefixes of every segment ``rec`` belongs to.

    Every record belongs to the overall segment (empty prefix). It also
    belongs to ``serp_`` / ``morda_`` when its ``service`` contains that
    substring, and to the matching ``used_*`` segments when its
    ``action_type`` is neither ``"not_shown"`` nor ``"not_used"``.
    """
    service = rec["service"]
    is_serp = "serp_" in service
    is_morda = "morda_" in service

    segments = ['']
    if is_serp:
        segments.append('serp_')
    if is_morda:
        segments.append('morda_')
    if rec["action_type"] not in ("not_shown", "not_used"):
        segments.append('used_')
        if is_serp:
            segments.append('used_serp_')
        if is_morda:
            segments.append('used_morda_')
    return segments


def count_metrics(key, recs):
    """Reduce all mapper records of one ``ip`` group into one metrics row.

    For each segment (overall, ``serp_``, ``morda_``, ``used_``,
    ``used_serp_``, ``used_morda_``) and each timing source (``sfc`` from
    ``since_first_change``, ``tit`` from ``total_input_time``) the row holds:

    * ``<segment>avrg_time_per_symb_normalized_<source>`` -- ``average()`` of
      the records' ``avg_time_per_symbol_<source>`` values;
    * ``<segment>median_input_time_normalized_<source>`` -- ``median()`` of
      the records' input times.

    ``key`` is accepted for the reducer signature but not read; the ``ip``
    of the output row is taken from the records. Yields exactly one dict,
    or nothing when ``recs`` is empty.
    """
    prefixes = ('', 'serp_', 'morda_', 'used_', 'used_serp_', 'used_morda_')
    sources = ('sfc', 'tit')
    # per_symbol[source][prefix] / input_times[source][prefix] -> list of values
    per_symbol = {src: {prefix: [] for prefix in prefixes} for src in sources}
    input_times = {src: {prefix: [] for prefix in prefixes} for src in sources}

    ip = None
    has_records = False
    for rec in recs:
        has_records = True
        ip = rec["ip"]
        total_input_time = rec['total_input_time']
        # Anything that is not a plain int is counted as 0 in the tit medians.
        # NOTE(review): these zeros pull the medians down -- confirm intended.
        if not isinstance(total_input_time, int) or isinstance(total_input_time, bool):
            total_input_time = 0
        for prefix in _record_segments(rec):
            per_symbol['sfc'][prefix].append(rec['avg_time_per_symbol_sfc'])
            per_symbol['tit'][prefix].append(rec['avg_time_per_symbol_tit'])
            input_times['sfc'][prefix].append(rec['since_first_change'])
            input_times['tit'][prefix].append(total_input_time)

    if not has_records:
        # No records means no ip to report; emit nothing.
        return

    metrics = {'ip': ip}
    for src in sources:
        for prefix in prefixes:
            metrics[prefix + 'avrg_time_per_symb_normalized_' + src] = \
                average(per_symbol[src][prefix])
            metrics[prefix + 'median_input_time_normalized_' + src] = \
                median(input_times[src][prefix])
    yield metrics


def get_country_name(ip):
    """Return the two-letter country label for a country id.

    Returns ``None`` when the id is not one of the known ones.
    """
    labels_by_id = {
        225: "Ru",
        187: "Ua",
        149: "By",
        159: "Kz",
        983: "Tr",
        171: "Uz",
        170: "Tm",
        209: "Tj",
        168: "Am",
        167: "Az",
        169: "Ge",
        207: "Kg",
        208: "Md",
    }
    return labels_by_id.get(ip)

def parse_args():
    """Parse the command line and return the argparse namespace.

    Recognised options (all optional, all strings): ``--timestamp``,
    ``--from_date`` and ``--to_date``.
    """
    parser = argparse.ArgumentParser(description='Suggest metrics calc', add_help=True)
    options = (
        ('--timestamp', 'date timestamp for calculation'),
        ('--from_date', 'from date for calculation (format: YYYY-MM-DD)'),
        ('--to_date', 'to date for calculation (format: YYYY-MM-DD)'),
    )
    for flag, description in options:
        parser.add_argument(flag, help=description)
    # Currently disabled options, kept for reference:
    #   --mr_server (default 'hahn'), --yt_pool (default 'robot-suggestor-dev')
    return parser.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Build the list of ``YYYY-MM-DD`` day strings to process.

    * ``timestamp`` -- when given, its first 10 characters are read as an
      integer epoch value and converted with ``datetime.fromtimestamp``;
      that day comes first in the result.
    * ``from_date`` / ``to_date`` -- when ``from_date`` is given, every day
      from it up to and including ``to_date`` is appended; a missing
      ``to_date`` defaults to the current day.

    Returns an empty list when neither ``timestamp`` nor ``from_date`` is set.
    """
    day_format = '%Y-%m-%d'
    days = []

    if timestamp:
        moment = datetime.fromtimestamp(int(timestamp[:10]))
        days.append(moment.strftime(day_format))

    if from_date:
        if not to_date:
            to_date = datetime.now().strftime(day_format)
        cursor = datetime.strptime(from_date, day_format)
        last_day = datetime.strptime(to_date, day_format)
        step = timedelta(1)
        while cursor <= last_day:
            days.append(cursor.strftime(day_format))
            cursor += step

    return days

def calc(date):
    """Run the map-reduce for one day and return the output table path.

    ``date`` is appended to fixed input and output directory paths to form
    the table names. The output table is created if it does not exist yet,
    then ``mapper`` and ``count_metrics`` are run over the input table,
    reducing by ``ip``. Also sets the ``memory_limit`` entry of the
    ``yt.wrapper`` configuration.
    """
    source_table = '//home/suggest-dev/suggest_logs/redir-log_preparates/clean/' + date
    result_table = '//home/suggest-dev/suggest/galamaj/suggest_metrics/time_per_symb/' + date

    yt.wrapper.config['memory_limit'] = 100 * 1024 ** 3
    yt.wrapper.create("table", path=result_table, recursive=True,
                      ignore_existing=True, attributes=None)
    destination = yt.wrapper.TablePath(result_table)

    # Progress line: current time, then input -> output.
    print("%s   %s -> %s" % (datetime.now(), source_table, result_table))

    yt.wrapper.run_map_reduce(mapper, count_metrics, source_table, destination,
                              reduce_by='ip', format=yt.wrapper.JsonFormat())
    return result_table


def push_to_razladki(outputTable):
    """Send every metric of ``outputTable`` to the razladki HTTP endpoint.

    The last path component of ``outputTable`` is parsed as ``YYYY-MM-DD``
    and converted with ``time.mktime`` into the timestamp sent with each
    metric. One request is made per non-None column of every row (the ``ip``
    column itself is skipped):

    * rows with ``ip == 225`` go to project ``suggest_metrics_touch`` under
      the plain column name;
    * all other rows go to project ``suggest_metrics_touch_rkubt`` under
      ``<country>_<column>``, where the country label comes from
      ``get_country_name``.

    Rows whose country id has no label are reported and skipped. A failed
    request is reported and followed by a 300-second pause; the metric is
    not retried.
    """
    url_template = ('http://launcher.razladki.yandex-team.ru/save_new_data/'
                    '%s?%s=%s&ts=%s&override=1')
    date = str(outputTable).split('/')[-1]
    timestamp = time.mktime(time.strptime(date, '%Y-%m-%d'))
    print(timestamp)

    for row in yt.wrapper.read_table(outputTable, raw=False):
        if row["ip"] == 225:
            project = "suggest_metrics_touch"
            name_prefix = ""
        else:
            region = get_country_name(row["ip"])
            if region is None:
                # Without a label the metric name cannot be built; skipping
                # here avoids a failure (and a long pause) for every column.
                print("skipping row with unknown country id %s" % (row["ip"],))
                continue
            project = "suggest_metrics_touch_rkubt"
            name_prefix = region + "_"

        for key, value in row.items():
            if key == "ip" or value is None:
                continue
            metric_name = name_prefix + key
            url = url_template % (project, metric_name, value, timestamp)
            print("%s %s" % (metric_name, value))
            try:
                response = urllib2.urlopen(url)
                try:
                    print(response.read())
                finally:
                    response.close()
            except Exception as err:
                # Best effort: report the failure, back off, and carry on
                # with the next metric. KeyboardInterrupt/SystemExit are not
                # caught, so the script can still be stopped.
                print("failed to push %s: %r" % (metric_name, err))
                time.sleep(300)


if __name__ == '__main__':
    # Script entry point: resolve the days to process from the command line,
    # then compute and push the metrics one day at a time.
    cli_args = parse_args()
    dates = get_dates(cli_args.timestamp, cli_args.from_date, cli_args.to_date)
    print(dates)
    for day in dates:
        push_to_razladki(calc(day))
