import yt.wrapper

import threading
import sys
import urlparse
import re
import urllib
import urllib2
import math
import time
import datetime
from datetime import date, datetime, timedelta
import argparse

def mapper(row):
    """Turn one suggest-log row into a per-query metrics record.

    Yields at most one dict per input row.  A row is dropped (nothing is
    yielded) when any of the following holds:

    * ``country_by_ip`` is not in the allowed country list;
    * ``query_length`` is missing (None) or outside 1..100;
    * ``text`` is the empty string or a single space;
    * ``requests_count`` is None or not positive;
    * ``response_count`` or ``empty_response_count`` is None;
    * the counters are inconsistent: more responses than requests, or more
      cached / empty responses than responses.

    ``cached_count`` may be None; in that case the cached-vs-responses
    consistency check is skipped.  The original code got the same result
    implicitly from Python 2 ordering None below every number; the checks are
    explicit here so the function also works where None is not orderable.

    The yielded record carries the raw counters (``ersp``, ``rqs``, ``rsp``,
    ``cchd``, ``clks``, ``rndr``, ``lrsp``) plus derived ratios:

    * ``ersp_part_rsp``  - empty responses / responses (1.0 if no responses);
    * ``ersp_part_rqs``  - empty responses / requests;
    * ``server_coverage`` - non-empty responses / responses (0.0 if none);
    * ``response_rate``  - responses / requests;
    * ``user_coverage``  - rendered / requests, or None if rendered is None;
    * ``not_ersp_not_rndr`` - non-empty responses minus rendered, or None;
    * ``not_ersp_not_rndr_part`` - the above divided by non-empty responses
      (0.0 if rendered is None or there are no non-empty responses);
    * ``lrsp_part_rqs``  - late responses / requests, or None.
    """
    allowed_countries = (225, 187, 149, 159, 983, 171, 170, 209, 168, 167, 169, 208, 207)
    try:
        ip = row["country_by_ip"]
        if ip not in allowed_countries:
            return

        query_length = row["query_length"]
        if query_length is None or not 0 < query_length < 101:
            return
        if row["text"] in ('', ' '):
            return

        requests = row["requests_count"]
        if requests is None or not requests > 0:
            return
        responses = row["response_count"]
        empty = row["empty_response_count"]
        if responses is None or empty is None:
            return

        # A missing cached counter cannot contradict the response counter.
        cached = row["cached_count"]
        cached_ok = cached is None or responses >= cached
        if not (requests >= responses and cached_ok and responses >= empty):
            return

        rqs = float(requests)
        rsp = float(responses)
        ersp = float(empty)
        non_empty = rsp - ersp

        if responses != 0:
            ersp_part_rsp = ersp / rsp
            server_coverage = non_empty / rsp
        else:
            # No responses at all: counted as "all empty", zero coverage.
            ersp_part_rsp = 1.
            server_coverage = 0.

        rendered = row["rendered_count"]
        if rendered is not None:
            rndr = float(rendered) / rqs
            not_ersp_not_rndr = non_empty - float(rendered)
            # Guard the division: no non-empty responses means a share of 0.
            if non_empty:
                not_ersp_not_rndr_part = not_ersp_not_rndr / non_empty
            else:
                not_ersp_not_rndr_part = 0.
        else:
            rndr = None
            not_ersp_not_rndr = None
            not_ersp_not_rndr_part = 0.

        late = row["late_response_count"]
        lrsp = float(late) / rqs if late is not None else None

        yield {"ip": ip,
               "service": row["service"],
               "ersp": empty,
               "rqs": requests,
               "rsp": responses,
               "cchd": cached,
               "clks": row["clicks_count"],
               "rndr": rendered,
               "lrsp": late,
               "ersp_part_rsp": ersp_part_rsp,
               "ersp_part_rqs": ersp / rqs,
               "user_coverage": rndr,
               "server_coverage": server_coverage,
               "response_rate": rsp / rqs,
               "not_ersp_not_rndr": not_ersp_not_rndr,
               "not_ersp_not_rndr_part": not_ersp_not_rndr_part,
               'lrsp_part_rqs': lrsp}
    except AttributeError:
        # NOTE(review): kept from the original as a best-effort skip; nothing
        # in this function obviously raises AttributeError for dict rows.
        pass

def count_metrics(key, recs):
    """Aggregate mapper records of one (service, ip) group into averages.

    Args:
        key: reduce key supplied by the framework; not used, the ``ip`` and
            ``service`` values are copied from the records themselves.
        recs: iterable of dicts as produced by ``mapper``.

    Yields:
        A single dict with the group's ``ip``/``service``, the record count
        (``total``), per-record averages of the raw counters, and rates
        expressed in percent (multiplied by 100).  Nothing is yielded for an
        empty group.

    Fields that may be None in a record (``lrsp``, ``cchd``, ``clks``,
    ``rndr``, ``user_coverage``, ``not_ersp_not_rndr``,
    ``not_ersp_not_rndr_part``, ``lrsp_part_rqs``) contribute nothing to their
    sum but the record still counts towards ``total``.
    """
    # Fields summed unconditionally.
    required = ("ersp", "rqs", "rsp", "ersp_part_rsp", "ersp_part_rqs",
                "server_coverage", "response_rate")
    # Fields that are skipped when the record holds None.
    optional = ("lrsp", "cchd", "clks", "rndr", "user_coverage",
                "not_ersp_not_rndr", "not_ersp_not_rndr_part", "lrsp_part_rqs")
    sums = dict.fromkeys(required + optional, 0.)

    result = {}
    total = 0.
    ersp_0 = 0.  # records without a single empty response
    for rec in recs:
        total += 1
        result["ip"] = rec["ip"]
        result["service"] = rec["service"]
        for field in required:
            sums[field] += rec[field]
        for field in optional:
            value = rec[field]
            if value is not None:
                sums[field] += value
        if rec["ersp"] == 0:
            ersp_0 += 1

    if not total:
        return
    ersp_not_0 = total - ersp_0

    result["rendered_suggest"] = sums["rndr"] / total
    result["suggest_user_coverage"] = sums["user_coverage"] / total * 100
    result["suggest_server_coverage"] = sums["server_coverage"] / total * 100
    result["server_response_rate"] = sums["response_rate"] / total * 100
    result["queries_without_empty_responses_from_suggest_rate"] = ersp_0 / total * 100
    result["queries_have_empty_responses_from_suggest_rate"] = ersp_not_0 / total * 100
    result["total"] = total
    result["empty_responses_from_suggest"] = sums["ersp"] / total
    result["requests_to_suggest"] = sums["rqs"] / total
    result["responses_from_suggest"] = sums["rsp"] / total
    result["responses_from_cache"] = sums["cchd"] / total
    result["clicks_on_suggest"] = sums["clks"] / total
    result["empty_responses_rate_per_1_query"] = sums["ersp_part_rsp"] / total
    result["not_empty_but_not_rendered_part"] = sums["not_ersp_not_rndr_part"] / total * 100
    result["not_empty_but_not_rendered"] = sums["not_ersp_not_rndr"] / total
    result["empty_responses_part_from_requests"] = sums["ersp_part_rqs"] / total * 100
    result["late_response"] = sums["lrsp"] / total
    result["late_response_part"] = sums["lrsp_part_rqs"] / total * 100
    if sums["rsp"] > 0:
        result["empty_responses_rate"] = sums["ersp"] / sums["rsp"] * 100
    else:
        # No responses in the whole group: treated as "all empty".  The value
        # is in percent like the branch above (the original emitted 1 here,
        # i.e. 1%, which did not match the percent scale).
        result["empty_responses_rate"] = 100.
    yield result

def parse_args():
    """Parse the command line of the metrics script.

    Recognised options: ``--timestamp``, ``--from_date``, ``--to_date``
    (all defaulting to None), ``--mr_server`` (default ``'hahn'``),
    ``--yt_pool`` (default ``'robot-suggestor-dev'``) and ``--output``
    (default ``'output'``).

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    # (flag, keyword arguments for add_argument), registered in this order.
    option_table = (
        ('--timestamp', {'help': 'date timestamp for calculation'}),
        ('--from_date', {'help': 'from date for calculation (format: YYYY-MM-DD)'}),
        ('--to_date', {'help': 'to date for calculation (format: YYYY-MM-DD)'}),
        ('--mr_server', {'default': 'hahn', 'help': 'MR server (hahn, banach, ...)'}),
        ('--yt_pool', {'default': 'robot-suggestor-dev', 'help': 'YT pool'}),
        ('--output', {'default': 'output', 'help': 'output date'}),
    )
    cli = argparse.ArgumentParser(description='Suggest metrics calc', add_help=True)
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Build the list of ``YYYY-MM-DD`` day strings to process.

    Args:
        timestamp: optional string; its first 10 characters are read as a
            Unix timestamp in seconds and converted with
            ``datetime.fromtimestamp`` (local time).
        from_date: optional first day of the range, ``YYYY-MM-DD``.
        to_date: optional last day of the range (inclusive), ``YYYY-MM-DD``.

    Returns:
        A list of day strings: the timestamp's day first (if a timestamp was
        given), followed by every day from ``from_date`` to ``to_date``.
        With ``from_date`` only, the range ends today; with neither bound,
        the range is just yesterday; with ``to_date`` only, no range is added.

    NOTE(review): a timestamp given without any range still triggers the
    "yesterday" default, so two entries are returned - confirm this is
    intended.
    """
    day_format = '%Y-%m-%d'
    collected = []

    if timestamp:
        stamp_day = datetime.fromtimestamp(int(timestamp[:10]))
        collected.append(stamp_day.strftime(day_format))

    # Fill in defaults for an open or absent range.
    if not to_date:
        if from_date:
            to_date = datetime.now().strftime(day_format)
        else:
            yesterday = datetime.now() - timedelta(days=1)
            from_date = to_date = yesterday.strftime(day_format)

    if from_date and to_date:
        first_day = datetime.strptime(from_date, day_format)
        last_day = datetime.strptime(to_date, day_format)
        # Both bounds are midnights, so the difference is a whole day count;
        # a reversed range gives an empty sequence.
        span = (last_day - first_day).days + 1
        collected.extend(
            (first_day + timedelta(offset)).strftime(day_format)
            for offset in range(span)
        )
    return collected

def calc(date):
    """Run the map / sort / reduce pipeline for one day.

    Reads the day's table under
    ``//home/suggest-dev/suggest_logs/redir-log_preparates/clean/``, maps it
    with ``mapper`` into a temporary table, sorts that by ``service`` and
    ``ip`` and reduces it with ``count_metrics`` into the day's table under
    ``//home/suggest-dev/galamaj/suggest_metrics/technical_metrics/``.

    Args:
        date: day string appended to both table directories.

    Returns:
        The path of the output table.
    """
    # An alternative input location used previously:
    # '//home/suggest-dev/galamaj/tmp/' + date
    source_table = '//home/suggest-dev/suggest_logs/redir-log_preparates/clean/' + date
    target_table = '//home/suggest-dev/galamaj/suggest_metrics/technical_metrics/' + date

    yt.wrapper.config['memory_limit'] = 100 * 1024 * 1024 * 1024
    yt.wrapper.create(
        "table",
        path=target_table,
        recursive=True,
        ignore_existing=True,
        attributes=None,
    )
    target_path = yt.wrapper.TablePath(target_table)

    print("%s   %s -> %s" % (str(datetime.now()), source_table, target_table))

    # The temporary table is only needed while the three operations run.
    with yt.wrapper.TempTable('//home/suggest-dev/galamaj/tmp', prefix='kpi_') as scratch_table:
        yt.wrapper.run_map(
            mapper,
            source_table,
            scratch_table,
            format=yt.wrapper.JsonFormat(),
        )
        yt.wrapper.run_sort(scratch_table, sort_by=['service', 'ip'])
        yt.wrapper.run_reduce(
            count_metrics,
            scratch_table,
            target_path,
            reduce_by=['service', 'ip'],
            format=yt.wrapper.JsonFormat(),
        )
    return target_table

if __name__ == '__main__':
    # Script entry point: compute metrics for every requested day and record
    # each processed day, one per line, in the --output file.
    #
    # NOTE(review): --mr_server and --yt_pool are parsed but not used anywhere
    # in this script - confirm whether they should configure yt.wrapper.
    args = parse_args()
    # The context manager closes the file even if calc() fails midway, so the
    # days already processed are not lost with an unflushed handle.
    with open(args.output, 'w') as result:
        dates = get_dates(args.timestamp, args.from_date, args.to_date)
        print(dates)
        # 'day' rather than 'date' to avoid rebinding the module-level
        # datetime.date import.
        for day in dates:
            print(day)
            calc(day)
            result.write(day + "\n")
