import yt.wrapper as yt
import sys
import argparse

def _time_per_symbol(raw_time, length, cap=60000):
    """Parse a raw input-time field and derive the average time per symbol.

    Returns ``(input_time, time_per_symbol)``. A positive time is clamped to
    ``cap`` before dividing by ``length``; a zero or negative time is returned
    as parsed, with ``time_per_symbol`` left as None.

    Raises ValueError when ``raw_time`` is not an integer string and
    ZeroDivisionError when ``length`` is zero and the time is positive.
    """
    input_time = int(raw_time)
    if input_time <= 0:
        return input_time, None
    input_time = min(input_time, cap)
    return input_time, input_time / length


def mapper(row):
    """Map one redir-log row to a per-query input-timing record.

    ``row["value"]`` is parsed as tab-separated ``key=value`` fields. A record
    is yielded only when the row mentions ``cid=2873`` and ``_touch.``, its
    ``path`` contains ``serp_`` and ``_touch.``, the query length taken from
    ``ratio`` is between 1 and 100, and ``text`` is non-blank and differs from
    ``prev_query``.

    Rows that are malformed in any way (missing field, non-numeric number,
    ``ratio``/``path`` without a ".", zero-length query with a positive input
    time, non-string value) are skipped silently instead of failing the job.
    """
    try:
        value = row["value"]
        # Cheap substring pre-filter before the row is actually parsed.
        if value.find("cid=2873") == -1 or value.find("_touch.") == -1:
            return

        redir_log = {}
        for item in value.split("\t"):
            # Split on the first "=" only, so a value that itself contains
            # "=" (e.g. a query text) is kept whole rather than truncated.
            param, separator, val = item.partition("=")
            if separator:
                redir_log[param] = val

        # The part of "ratio" after the first "." is used as the query length.
        query_length = int(redir_log["ratio"].split(".")[1])
        path = redir_log["path"]
        action_type = path.split(".")[1]

        if "serp_" not in path or "_touch." not in path:
            return
        if not 0 < query_length < 101:
            return
        text = redir_log["text"]
        if text == redir_log["prev_query"] or text in ('', ' '):
            return

        len_query = float(query_length)
        if text.endswith(' '):
            # A trailing space is excluded from the length used for averaging.
            len_query -= 1

        # A missing time field raises KeyError below and drops the whole row.
        input_time_sfc, atps_sfc = _time_per_symbol(
            redir_log["since_first_change"], len_query)
        input_time_tit, atps_tit = _time_per_symbol(
            redir_log["total_input_time"], len_query)

        yield {'uid': redir_log["yandexuid"],
               'avg_time_per_symbol_sfc': atps_sfc,
               'avg_time_per_symbol_tit': atps_tit,
               'since_first_change': input_time_sfc,
               'total_input_time': input_time_tit,
               'action_type': action_type,
               'query_length': len_query}
    except (AttributeError, KeyError, ValueError, IndexError,
            ZeroDivisionError):
        # Best-effort parsing: a malformed row is simply not emitted.
        pass


def average(measures):
    """Return the running mean of the non-None items in ``measures``.

    None entries are ignored. When nothing remains to average (empty input or
    only None entries) the result is 0.0.
    """
    mean = 0.
    count = 0
    for value in measures:
        if value is None:
            continue
        count += 1
        # Incremental update: shift the mean by the new value's share.
        mean += (value - mean) / count
    return mean

def median(lst):
    """Return the median of ``lst``, or None when ``lst`` is empty.

    For an odd number of items the middle element itself is returned; for an
    even number the result is the float mean of the two middle elements.
    The input is not modified.
    """
    ordered = sorted(lst)
    size = len(ordered)
    if size < 1:
        return None
    # Floor division keeps the index an int on Python 3 as well as Python 2.
    middle = size // 2
    if size % 2 == 1:
        return ordered[middle]
    return float(sum(ordered[middle - 1:middle + 1])) / 2.0

def used_metrics(path):
    """Return the percentage of shown actions that were actually used.

    Items equal to 'not_shown' are removed from both numerator and
    denominator; items equal to 'not_used' are removed from the numerator
    only. Returns None when every item is 'not_shown' or ``path`` is empty.
    """
    total = float(len(path))
    unused = sum(1 for action in path if action == 'not_used')
    hidden = sum(1 for action in path if action == 'not_shown')
    shown = total - hidden
    if shown == 0:
        # Nothing was shown, so the percentage is undefined.
        return None
    return (total - unused - hidden) / shown * 100.0

def _count_time_anomalies(values):
    """Return ``(zero, negative, null)`` counts for a list of time values.

    None is tested first so that it is counted only as null and is never
    compared with a number.
    """
    zero = negative = null = 0
    for value in values:
        if value is None:
            null += 1
        elif value == 0:
            zero += 1
        elif value < 0:
            negative += 1
    return zero, negative, null


def count_metrics(key, recs):
    """Reduce the joined records of one experiment into a summary row.

    Each record is expected to carry parallel lists under
    'avg_time_per_symbol_sfc', 'avg_time_per_symbol_tit',
    'since_first_change', 'total_input_time', 'query_length' and
    'action_type', plus the 'exprt' id. The lists of all records are pooled
    and a single row with averages, medians and anomaly percentages is
    yielded. Nothing is yielded when ``recs`` is empty.

    ``key`` is not used; the experiment id is taken from the records.
    """
    atps_sfc = []
    atps_tit = []
    times_sfc = []
    times_tit = []
    length = []
    path = []
    zero_sfc = negative_sfc = null_sfc = 0
    zero_tit = negative_tit = null_tit = 0
    total = 0.  # float so that the percentage divisions below stay float
    exprt = None

    for rec in recs:
        exprt = rec['exprt']
        total += 1
        length.extend(rec['query_length'])
        path.extend(rec['action_type'])
        if rec['avg_time_per_symbol_sfc'] is not None:
            atps_sfc.extend(rec['avg_time_per_symbol_sfc'])
            times_sfc.extend(rec['since_first_change'])
        if rec['avg_time_per_symbol_tit'] is not None:
            atps_tit.extend(rec['avg_time_per_symbol_tit'])
            times_tit.extend(rec['total_input_time'])

        zero, negative, null = _count_time_anomalies(rec['since_first_change'])
        zero_sfc += zero
        negative_sfc += negative
        null_sfc += null

        zero, negative, null = _count_time_anomalies(rec['total_input_time'])
        zero_tit += zero
        negative_tit += negative
        null_tit += null

    if not total:
        # No records: there is no experiment id and nothing to divide by.
        return

    # NOTE(review): the anomaly counters are per list element while ``total``
    # is the number of records, so these "percentages" can exceed 100 --
    # confirm this is the intended denominator.
    yield {'exprt': exprt,
           'total': total,
           'used_%': used_metrics(path),
           'zero_sfc_%': zero_sfc / total * 100,
           'null_sfc_%': null_sfc / total * 100,
           'negative_sfc_%': negative_sfc / total * 100,
           'zero_tit_%': zero_tit / total * 100,
           'null_tit_%': null_tit / total * 100,
           'negative_tit_%': negative_tit / total * 100,
           'avrg_time_per_symb_sfc': average(atps_sfc),
           'avrg_time_per_symb_tit': average(atps_tit),
           'median_input_time_sfc': median(times_sfc),
           'median_input_time_tit': median(times_tit),
           'avrg_normalized_time_sfc': average(times_sfc),
           'avrg_normalized_time_tit': average(times_tit),
           'avrg_query_length_normalized': average(length)}

def get_uids_with_exprt(row):
    """Yield ``{"uid", "exprt"}`` for a touch-search session row in a tracked bucket.

    Only rows whose ``row["value"]`` mentions both ``service=www.yandex`` and
    ``ui=yandex/touchsearch`` are considered. The uid is ``row["key"]`` with
    leading "y" characters stripped. Bucket ids are looked up only inside
    tab-separated fields that contain ``test_buckets=``; when several tracked
    ids occur, the last match (in field order, then id order) wins. Nothing is
    yielded when no tracked bucket is found.
    """
    value = row["value"]
    if "service=www.yandex" not in value or "ui=yandex/touchsearch" not in value:
        return

    uid = row["key"].lstrip("y")
    exprt = None
    for item in value.split("\t"):
        # Only the test_buckets field may decide the experiment; other fields
        # can contain the same digits by coincidence.
        if "test_buckets=" not in item:
            continue
        for bucket in ("41862", "41863", "41864", "41865"):
            if (bucket + ",") in item:
                exprt = bucket

    if exprt is not None:
        yield {"uid": uid,
               "exprt": exprt}


def join_uids(key, recs):
    """Join one uid's timing records with that uid's experiment id.

    ``recs`` mixes two kinds of records: those carrying 'exprt' (the last one
    seen wins) and timing records, whose six metric fields are collected into
    parallel lists. A timing record that lacks any of the six fields is
    skipped as a whole, so the lists always stay aligned.

    Yields a single joined row when an experiment id and at least one timing
    record were seen; otherwise (including empty ``recs``) yields nothing.
    ``key`` is not used; the uid is read from the records.
    """
    fields = ('avg_time_per_symbol_sfc',
              'avg_time_per_symbol_tit',
              'since_first_change',
              'total_input_time',
              'query_length',
              'action_type')
    collected = dict((name, []) for name in fields)
    uid = None
    exprt = None

    for rec in recs:
        uid = rec["uid"]
        if 'exprt' in rec:
            exprt = rec['exprt']
            continue
        try:
            # Read every field before appending anything, so a record with a
            # missing field cannot leave the lists with different lengths.
            values = [rec[name] for name in fields]
        except KeyError:
            continue
        for name, item in zip(fields, values):
            collected[name].append(item)

    if uid is None or exprt is None or not collected['avg_time_per_symbol_sfc']:
        return

    joined = {'uid': uid, 'exprt': exprt}
    joined.update(collected)
    yield joined




def HandleOption():
    """Build and return the command-line parser for this script.

    The parser knows a single optional ``--server`` option (stored as
    ``server``) that defaults to 'hahn.yt.yandex.net'.
    """
    option_parser = argparse.ArgumentParser()
    server_option = dict(
        dest="server",
        help="yt server",
        default='hahn.yt.yandex.net',
        required=False,
    )
    option_parser.add_argument("--server", **server_option)
    return option_parser


def main():
    """Run the whole pipeline once per day in a fixed list of dates.

    For every day: make sure the redir, joined and result tables exist, map
    user sessions to (uid, exprt) pairs, map the redir log to timing records,
    sort both outputs by uid and join them, then sort the joined table by
    exprt and reduce it into the per-experiment result table.
    """
    options = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': options.server}})

    days = [
        "2017-04-07", "2017-04-08", "2017-04-09", "2017-04-10",
        "2017-04-11", "2017-04-12", "2017-04-13", "2017-04-14",
        "2017-04-15", "2017-04-16", "2017-04-17", "2017-04-18",
        "2017-04-19", "2017-04-20",
    ]

    for day in days:
        sessions_table = '//user_sessions/pub/search/daily/' + day + '/clean'
        uids_table = '//home/suggest-dev/galamaj/analytics/experiments/nav_uids_' + day
        redir_table = '//home/logfeller/logs/redir-log/1d/' + day
        timing_table = '//home/suggest-dev/galamaj/analytics/experiments/redir_' + day
        joined_table = '//home/suggest-dev/galamaj/analytics/experiments/joined_' + day
        result_table = '//home/suggest-dev/galamaj/analytics/experiments/result_' + day

        # Create the three tables written by this script if they are missing.
        for table in (timing_table, joined_table, result_table):
            if not yt.exists(table):
                yt.create_table(path=table, recursive=True)

        # Both map stages ask for ~16GB of input per job.
        yt.run_map(get_uids_with_exprt, sessions_table, uids_table,
                   spec={'data_size_per_job': 16000000000})
        yt.run_map(mapper, redir_table, timing_table,
                   spec={'data_size_per_job': 16000000000})

        # Both inputs of the join are sorted by uid before the reduce.
        yt.run_sort(timing_table, sort_by="uid")
        yt.run_sort(uids_table, sort_by=["uid"])
        yt.run_reduce(join_uids, [timing_table, uids_table], joined_table,
                      reduce_by=['uid'])

        yt.run_sort(joined_table, sort_by="exprt")
        yt.run_reduce(count_metrics, joined_table, result_table,
                      reduce_by="exprt")






# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()











