#!/usr/bin/env python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import argparse
import datetime
from datetime import date, datetime, timedelta
import re
import json

########################################################################################################################

def parse_args():
    """Parse the command-line options of the DAU calculation script.

    Returns:
        The ``argparse.Namespace`` with the attributes ``timestamp``,
        ``from_date``, ``to_date``, ``mr_server``, ``yt_pool``,
        ``experiments`` and ``output``. Options without a default are None
        when they are not given on the command line.
    """
    # (flag, keyword arguments for add_argument), in the order they are shown
    # in the generated --help output.
    option_specs = (
        ('--timestamp', {'help': 'date timestamp for calculation'}),
        ('--from_date', {'help': 'from date for calculation (format: YYYY-MM-DD)'}),
        ('--to_date', {'help': 'to date for calculation (format: YYYY-MM-DD)'}),
        ('--mr_server', {'default': 'hahn', 'help': 'MR server (hahn, banach, ...)'}),
        ('--yt_pool', {'default': 'robot-suggestor-dev', 'help': 'YT pool'}),
        ('--experiments', {'help': 'Comma separated experiments numbers (format: 12345,67891,35474)'}),
        ('--output', {'default': 'output', 'help': 'output date'}),
    )
    cli = argparse.ArgumentParser(add_help=True, description='Ya.Stroka metrics DAU calc')
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()

def get_dates(timestamp, from_date, to_date):
    """Build the list of days (``YYYY-MM-DD`` strings) to process.

    Args:
        timestamp: Unix timestamp as a string, or None/empty. Only the first
            10 characters are used, so a millisecond timestamp is cut down to
            seconds. The conversion uses the local time zone.
        from_date: first day of the range (``YYYY-MM-DD``), or None/empty.
        to_date: last day of the range, inclusive, or None/empty.

    Returns:
        A list of date strings: the day of ``timestamp`` (when given)
        followed by every day from ``from_date`` to ``to_date`` inclusive.
        When ``from_date`` is given without ``to_date`` the range runs up to
        today. When nothing at all is given the result is yesterday only.
        ``to_date`` without ``from_date`` adds no range.

    Raises:
        ValueError: if ``timestamp`` is not numeric or a date does not match
            the ``YYYY-MM-DD`` format.
    """
    date_format = '%Y-%m-%d'
    dates = []
    if timestamp:
        day = datetime.fromtimestamp(int(timestamp[:10]))
        dates.append(day.strftime(date_format))
    if from_date and not to_date:
        to_date = datetime.now().strftime(date_format)
    # Fall back to "yesterday" only when the caller asked for nothing at all.
    # Without the timestamp check an explicit --timestamp run would process
    # yesterday as well (or the same day twice).
    if not timestamp and not from_date and not to_date:
        from_date = (datetime.now() - timedelta(days=1)).strftime(date_format)
        to_date = from_date
    if from_date and to_date:
        current = datetime.strptime(from_date, date_format)
        last = datetime.strptime(to_date, date_format)  # parsed once, not per iteration
        while current <= last:
            dates.append(current.strftime(date_format))
            current += timedelta(days=1)
    return dates

########################################################################################################################

class StartAction:
    """Mapper that labels every row with the distribution type of its clid.

    The instance is built from a dict with the keys ``portal``, ``switch``
    and ``commercial``, each holding a collection of clid values. When called
    with a row it sets ``row['distr_type']`` to the name of the first
    collection (in that order) containing the row's clid, or to None when
    the clid is in none of them.
    """

    # Lookup order: a clid listed under several types gets the first one.
    _PRIORITY = ('portal', 'switch', 'commercial')

    def __init__(self, distr_dict):
        """Store the clid collections and print the size of each one.

        Args:
            distr_dict: dict with the keys ``portal``, ``switch`` and
                ``commercial`` mapping to collections of clid values.

        Raises:
            KeyError: if one of the three keys is missing.
        """
        self.distr_dict = distr_dict
        for name in self._PRIORITY:
            print(len(distr_dict[name]))
        # One clid -> type dict gives a constant-time lookup per row instead
        # of scanning up to three lists. Filling it in reverse priority order
        # lets the higher-priority type overwrite the lower one.
        self._type_by_clid = {}
        for name in reversed(self._PRIORITY):
            for clid in distr_dict[name]:
                self._type_by_clid[clid] = name

    def __call__(self, row):
        """Set ``row['distr_type']`` and yield the same row.

        Only the part of ``row['clid']`` before the first ``-`` is looked up;
        a None clid yields ``distr_type`` None unless None itself is listed.
        """
        clid = row['clid']
        if clid is not None and "-" in clid:
            clid = clid.split("-")[0]
        row['distr_type'] = self._type_by_clid.get(clid)
        yield row

########################################################################################################################

def parse_json(counters):
    """Extract the ``str_view``, ``IE`` and ``mic`` settings from a counters JSON string.

    The string is decoded only when it contains the text ``setting``. The
    first element of the decoded array is used, and only when its
    ``parent-path`` equals ``setting``. For each of its children tagged
    ``ie``, ``mic`` or ``str_view`` the value is the ``ctag`` of that child's
    first own child.

    Args:
        counters: JSON text.

    Returns:
        dict with the keys ``str_view``, ``IE`` and ``mic``; a value is None
        when the setting was not found.

    Raises:
        ValueError, KeyError, IndexError, TypeError: when the JSON is
            malformed or does not have the expected structure.
    """
    settings = {'str_view': None, 'IE': None, 'mic': None}
    if 'setting' not in counters:
        return settings
    root = json.loads(counters)[0]
    if root['parent-path'] != 'setting':
        return settings
    # (ctag of the child node, key in the result)
    tag_to_field = (('ie', 'IE'), ('mic', 'mic'), ('str_view', 'str_view'))
    for node in root['children']:
        for tag, field in tag_to_field:
            if node['ctag'] == tag:
                settings[field] = node['children'][0]['ctag']
    return settings



########################################################################################################################

def parse_websessions(key, rows):
    """Reducer: merge all session rows of one key into a single parameter dict.

    Every row has a ``key`` and a tab-separated ``value``. Each item of the
    value is handled as follows:

    * ``counters=<json>`` is passed to ``parse_json`` and overwrites
      ``str_view``, ``mic`` and ``IE``; an item that cannot be parsed is
      skipped;
    * an item with exactly one ``=`` is a ``name=value`` pair;
    * any other item is treated as a comma-separated variable list with an
      optional ``vars=`` prefix, in which only variables whose name starts
      with ``-`` are used (the dashes are dropped).

    Only names that are keys of the result dict are stored; later values
    overwrite earlier ones.

    Args:
        key: reduce key supplied by the caller (not used).
        rows: iterable of rows with the fields ``key`` and ``value``.

    Yields:
        At most one dict with the keys ``yandexuid``, ``path``, ``ui``,
        ``ver``, ``dayuse``, ``str_view``, ``clid1``, ``IE``, ``mic`` and
        ``key``, and only when both ``path`` and ``ui`` are set.
    """
    # Records with path=injector.load.* or path=supdate.* are dropped so that
    # they do not attach an extra yandexuid.
    INJECTOR = re.compile(r'(injector\.load\.|supdate\.)')
    VARS_PREFIX = 'vars='
    params = {'yandexuid' : None, 'path' : None, 'ui' : None, 'ver' : None, 'dayuse' : None, 'str_view' : None, 'clid1' : None, 'IE' : None, 'mic' : None, 'key': None}
    for row in rows:
        row_key = row['key']
        params['key'] = row_key
        # The uid is either the last "/"-separated part of the key or the
        # whole key when it has no dot and is longer than 13 characters.
        if '/' in row_key:
            uid = row_key.split('/')[-1]
        elif '.' not in row_key and len(row_key) > 13:
            uid = row_key
        else:
            uid = None
        params["yandexuid"] = uid
        for item in row['value'].split("\t"):
            if item.startswith('counters='):
                try:
                    # Split on the first "=" only: the JSON payload may
                    # contain "=" itself.
                    counters_params = parse_json(item.split("=", 1)[1])
                except Exception:
                    # Best effort: a broken counters item is skipped.
                    continue
                params['str_view'] = counters_params['str_view']
                params['mic'] = counters_params['mic']
                params['IE'] = counters_params['IE']
            if item.count("=") == 1:
                name, val = item.split("=")
                if name in params:
                    params[name] = val
            else:
                # Drop the literal prefix. str.lstrip('vars=') would strip
                # any run of the characters v, a, r, s and "=" and could eat
                # the beginning of the first variable name.
                if item.startswith(VARS_PREFIX):
                    item = item[len(VARS_PREFIX):]
                for var in item.split(","):
                    name = var.split("=")[0]
                    if name.startswith("-"):
                        name = name.lstrip("-")
                        val = var.split("=")[-1]
                        if name in params:
                            params[name] = val
    if params['path'] is not None and INJECTOR.search(params['path']):
        params['path'] = None
    # For an "exp." path without an explicit version the second dot-separated
    # part of the path is used as the version.
    if params['ver'] is None and params['path'] is not None and 'exp.' in params['path']:
        params['ver'] = params['path'].split(".")[1]
    if params['path'] is not None and params['ui'] is not None:
        yield params

########################################################################################################################

def check_dayuse(dayuse):
    """Map a ``dayuse`` counter to its bucket label.

    Args:
        dayuse: the counter as an int or a numeric string.

    Returns:
        ``'1'`` for 0, ``'2'`` for 1, ``'3+'`` from 2, ``'5+'`` from 4,
        ``'10+'`` from 9, ``'50+'`` from 49, ``'100+'`` from 99 and
        ``'200+'`` from 200. None for a negative value and for a value that
        cannot be converted to an int.
    """
    try:
        days = int(dayuse)
    except (TypeError, ValueError):
        # A non-numeric value used to fall through to the comparisons below;
        # on Python 2 any string compares greater than an int, so it ended up
        # in the '200+' bucket.
        return None
    # (lower bound, label), highest bucket first; the first match wins.
    # NOTE(review): every other bound is "label - 1", while '200+' starts at
    # 200 rather than 199 - kept as is, confirm whether that is intended.
    buckets = (
        (200, '200+'),
        (99, '100+'),
        (49, '50+'),
        (9, '10+'),
        (4, '5+'),
        (2, '3+'),
        (1, '2'),
        (0, '1'),
    )
    for lower_bound, label in buckets:
        if days >= lower_bound:
            return label
    return None


def check_param(param):
    """Return the first element of ``param``, or None when it is empty.

    Args:
        param: a sequence (anything that supports ``len`` and indexing).
    """
    return param[0] if len(param) > 0 else None


def check_actions(path):
    """Report which kinds of actions occur in a list of dotted paths.

    Each path is split on ``.`` and its parts are compared with the action
    names as whole parts. ``tablo_click`` is set only when one and the same
    path contains both ``tablo`` and ``click``.

    Args:
        path: iterable of dotted path strings.

    Returns:
        dict of booleans with the keys ``voice``, ``text``, ``tablo``,
        ``voice_in``, ``text_in`` and ``tablo_click``.
    """
    seen = dict.fromkeys(('voice', 'text', 'tablo', 'voice_in', 'text_in', 'tablo_click'), False)
    direct_names = ('voice', 'voice_in', 'text', 'text_in', 'tablo')
    for entry in path:
        tokens = entry.split(".")
        for name in direct_names:
            if name in tokens:
                seen[name] = True
        if "tablo" in tokens and "click" in tokens:
            seen['tablo_click'] = True
    return seen


def distr_type(date):
    """Read the distribution report of one day and group its ids by type.

    The report is read with ``yt.read_file`` from
    ``//statbox/statbox-dict-by-name/distr_report/<date>``. Every item it
    yields is split on tabs; the first field is the distribution type and the
    second one is the id.

    Args:
        date: name of the report file (the callers in this file pass a
            ``YYYY-MM-DD`` string).

    Returns:
        dict with the keys ``portal`` (type ``ya-self-distribution``),
        ``switch`` (type ``ya-switch-distribution``) and ``commercial``
        (every other type), each holding the list of ids in file order.

    Raises:
        IndexError: if an item has fewer than two tab-separated fields.
    """
    buckets = {'portal': [], 'switch': [], 'commercial': []}
    bucket_by_kind = {
        "ya-self-distribution": 'portal',
        "ya-switch-distribution": 'switch',
    }
    report = '//statbox/statbox-dict-by-name/distr_report/' + date
    for line in yt.read_file(report):
        fields = line.split("\t")
        kind, cid = fields[0], fields[1]
        # Anything that is neither self- nor switch-distribution is commercial.
        buckets[bucket_by_kind.get(kind, 'commercial')].append(cid)
    return buckets

########################################################################################################################

def group_by_one_ui(key, recs):
    """Reducer: collapse all parsed records of one ``ui`` into a single row.

    For ``str_view``, ``clid1``, ``dayuse`` (bucketed with ``check_dayuse``),
    ``mic`` and ``IE`` the first non-None value among the records is kept.
    ``path`` becomes the list of paths containing ``voice.``, ``searchbar.``,
    ``tablo.`` or ``text.``. ``ver`` is ``exp.<second path part>`` of the
    last record whose path contains ``exp.``, otherwise the ``ver`` of the
    last record.

    A row is produced only when the records agree on one yandexuid (records
    without a yandexuid are ignored as long as exactly one other value
    exists; if no record has one the literal ``'none'`` is used), fewer than
    10000 action paths were collected and the IE value is not ``'8'``.

    Args:
        key: reduce key supplied by the caller (not used).
        recs: iterable of dicts with the fields ``ui``, ``ver``,
            ``yandexuid``, ``IE``, ``str_view``, ``clid1``, ``dayuse``,
            ``mic`` and ``path``.

    Yields:
        At most one dict with the keys ``ui``, ``dayuse``, ``yandexuid``,
        ``ver``, ``str_view``, ``path``, ``clid``, ``mic`` and ``ie``.
    """
    ACTION = re.compile(r'(voice|searchbar|tablo|text)\.')
    EXP = re.compile(r'exp\.')
    ui = None
    ver = None
    exp_ver = None
    path = []
    uids = set()
    str_view = []
    clid1 = []
    dayuse = []
    ie = []
    mic = []
    for rec in recs:
        ui = rec['ui']
        ver = rec['ver']
        uids.add('none' if rec['yandexuid'] is None else rec['yandexuid'])
        # Skip None like for every other field; otherwise the IE check below
        # only ever looks at the first record, which usually has no IE value.
        if rec['IE'] is not None:
            ie.append(rec['IE'])
        if rec['str_view'] is not None:
            str_view.append(rec['str_view'])
        if rec['clid1'] is not None:
            clid1.append(rec['clid1'])
        if rec['dayuse'] is not None:
            bucket = check_dayuse(rec['dayuse'])
            # A value without a bucket must not hide a valid later one.
            if bucket is not None:
                dayuse.append(bucket)
        if rec['mic'] is not None:
            mic.append(rec['mic'])
        if ACTION.search(rec['path']):
            path.append(rec['path'])
        if EXP.search(rec['path']):
            # Remembered separately: assigning to ``ver`` here would be
            # overwritten by the next record of the group.
            exp_ver = 'exp.' + rec['path'].split('.')[1]
    if exp_ver is not None:
        ver = exp_ver
    if len(uids) == 1:
        yandexuid = next(iter(uids))
    elif len(uids) == 2 and 'none' in uids:
        uids.discard('none')
        yandexuid = next(iter(uids))
    else:
        # No records at all, or several different yandexuids for one ui.
        yandexuid = None
    if yandexuid is not None and len(path) < 10000 and check_param(ie) != '8':
        yield {'ui' : ui,
            'dayuse' : check_param(dayuse),
            'yandexuid' : yandexuid,
            'ver' : ver,
            'str_view' : check_param(str_view),
            'path' : path,
            'clid' : check_param(clid1),
            'mic' : check_param(mic),
            'ie' : check_param(ie)
            }

########################################################################################################################

def count_dau(key, rows):
    """Reducer: count users and their action kinds for one group of rows.

    Rows with ``yandexuid`` None and ``ui`` equal to ``'{none}'`` or None are
    skipped. Every other row adds one to ``total``. A row with an empty
    ``path`` list counts as ``only_start``; otherwise it counts as
    ``at_least_1_action`` and once for every action kind reported by
    ``check_actions``. The combination counters are based on ``voice_in``,
    ``text_in`` and ``tablo_click``: exactly two of them set, or all three
    (``all_actions``).

    Args:
        key: mapping of the reduce columns to their values (must support
            ``keys()`` and indexing); it is copied into the result as is.
        rows: iterable of rows with the fields ``yandexuid``, ``ui`` and
            ``path``.

    Yields:
        One dict with the key columns, every counter, and for every counter
        except ``total`` a ``<counter>_part`` field holding its share of
        ``total`` (0.0 when ``total`` is 0).
    """
    counted = {'voice': 0, 'voice_in': 0, 'total': 0, 'only_start': 0, 'at_least_1_action': 0, 'all_actions': 0, 'text': 0, 'text_in': 0, 'tablo': 0,'tablo_click': 0, 'voice_and_text' : 0, 'voice_and_tablo' : 0, 'text_and_tablo' : 0}
    # Remember the counter names before the key columns are mixed in, so that
    # shares are computed for counters only, whatever the number and the
    # value types of the key columns.
    metric_names = [name for name in counted if name != 'total']
    for column in key.keys():
        counted[column] = key[column]
    for row in rows:
        if row["yandexuid"] is None and (row["ui"] == '{none}' or row["ui"] is None):
            continue
        counted['total'] += 1
        path = row["path"]
        if path == []:
            counted['only_start'] += 1
            continue
        counted['at_least_1_action'] += 1
        actions = check_actions(path)
        for name, happened in actions.items():
            if happened:
                counted[name] += 1
        voice = actions['voice_in']
        text = actions['text_in']
        tablo = actions['tablo_click']
        if voice and text and not tablo:
            counted['voice_and_text'] += 1
        if voice and tablo and not text:
            counted['voice_and_tablo'] += 1
        if text and tablo and not voice:
            counted['text_and_tablo'] += 1
        if text and tablo and voice:
            counted['all_actions'] += 1
    total = float(counted['total'])
    parts = {}
    for name in metric_names:
        # A group in which every row was skipped has total == 0; report zero
        # shares instead of failing with ZeroDivisionError.
        parts[name + '_part'] = float(counted[name]) / total if total else 0.0
    counted.update(parts)
    yield counted

########################################################################################################################

def count_total_dau(input, output):
    """Count users and their action kinds over a whole table.

    The rows of ``input`` are read with ``yt.read_table``. Rows with
    ``yandexuid`` None and ``ui`` equal to ``'{none}'`` or None are skipped.
    Every other row adds one to ``total``. A row with an empty ``path`` list
    counts as ``only_start``; otherwise it counts as ``at_least_1_action``
    and once for every action kind reported by ``check_actions``. The
    combination counters are based on ``voice_in``, ``text_in`` and
    ``tablo_click``: exactly two of them set, or all three (``all_actions``).

    The resulting single row - every counter plus a ``<counter>_part`` share
    of ``total`` for each counter except ``total`` itself - is printed and
    written to ``output`` with ``yt.write_table``.

    Args:
        input: path of the table to read.
        output: path of the table that receives the single result row.
    """
    counted = {'voice': 0, 'voice_in': 0, 'total': 0, 'only_start': 0, 'at_least_1_action': 0, 'all_actions': 0, 'text': 0, 'text_in': 0, 'tablo': 0,'tablo_click': 0, 'voice_and_text' : 0, 'voice_and_tablo' : 0, 'text_and_tablo' : 0}
    metric_names = [name for name in counted if name != 'total']
    for row in yt.read_table(input):
        if row["yandexuid"] is None and (row["ui"] == '{none}' or row["ui"] is None):
            continue
        counted['total'] += 1
        path = row["path"]
        if path == []:
            counted['only_start'] += 1
            continue
        counted['at_least_1_action'] += 1
        actions = check_actions(path)
        for name, happened in actions.items():
            if happened:
                counted[name] += 1
        voice = actions['voice_in']
        text = actions['text_in']
        tablo = actions['tablo_click']
        if voice and text and not tablo:
            counted['voice_and_text'] += 1
        if voice and tablo and not text:
            counted['voice_and_tablo'] += 1
        if text and tablo and not voice:
            counted['text_and_tablo'] += 1
        if text and tablo and voice:
            counted['all_actions'] += 1
    total = float(counted['total'])
    parts = {}
    for name in metric_names:
        # An empty (or fully skipped) table has total == 0; report zero
        # shares instead of failing with ZeroDivisionError.
        parts[name + '_part'] = float(counted[name]) / total if total else 0.0
    counted.update(parts)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(counted)
    yt.write_table(output, [counted], raw=False)

########################################################################################################################

def main(date, distr_dict):
    """Run the whole DAU pipeline on YT for one day.

    Steps, in order:

    1. set ``yt.config['memory_limit']`` to 100 GiB and create every result
       table (and the ``final_data`` table) that does not exist yet;
    2. sort the day's session table by ``key`` and reduce it with
       ``parse_websessions``;
    3. sort the parsed rows by ``ui``, ``yandexuid`` and reduce them by
       ``ui`` with ``group_by_one_ui``;
    4. map the joined rows through ``StartAction(distr_dict)`` into
       ``final_data``;
    5. for every breakdown sort its source table by the breakdown columns and
       reduce it with ``count_dau`` into the breakdown's result table;
    6. compute the overall totals with ``count_total_dau``;
    7. remove the ``sorted_``, ``parsed_`` and ``joined_`` temporary tables
       (``final_data`` is kept).

    Args:
        date: the day to process; it is appended to every table path.
        distr_dict: clid collections passed to ``StartAction``.
    """
    tmp_prefix = '//home/suggest-dev/galamaj/yastroka/tmp/'
    dau_prefix = '//home/suggest-dev/galamaj/yastroka/dau/'

    def dau_table(name):
        # Result table of one breakdown for this day.
        return dau_prefix + name + '/' + date

    def wide_rows_spec():
        # Fresh spec per operation; raises the writer's max_row_weight.
        return {"job_io": {"table_writer": {"max_row_weight": 134217728}}}

    sessions = '//home/desktop/searchband/sessions/' + date
    sorted_sessions = tmp_prefix + 'sorted_' + date
    parsed = tmp_prefix + 'parsed_' + date
    joined = tmp_prefix + 'joined_' + date
    final_data = tmp_prefix + 'final_data_' + date

    yt.config['memory_limit'] = 100 * 1024 ** 3

    # Tables that must exist before the operations start.
    required = [dau_table(name) for name in ('ver', 'dayuse', 'distr', 'strview')]
    required.append(final_data)
    required.extend(dau_table(name) for name in (
        'strview_dayuse', 'strview_distr', 'distr_dayuse', 'strview_distr_dayuse',
        'total', 'mic', 'mic_dayuse', 'mic_strview', 'mic_ver', 'mic_distr',
    ))
    for table in required:
        if not yt.exists(table):
            yt.create_table(path=table, recursive=True)

    # Raw sessions -> one row per key -> one row per ui -> rows with distr_type.
    yt.run_sort(sessions, sorted_sessions, sort_by=['key'])
    yt.run_reduce(parse_websessions, sorted_sessions, parsed,
                  sort_by=['key'], reduce_by=['key'], spec=wide_rows_spec())
    yt.run_sort(parsed, sort_by=['ui', 'yandexuid'])
    yt.run_reduce(group_by_one_ui, parsed, joined,
                  sort_by=['ui'], reduce_by=['ui'], spec=wide_rows_spec())
    yt.run_map(StartAction(distr_dict), joined, final_data)

    # (source table, result table name, columns to group by). The source
    # table is re-sorted in place before every reduce.
    breakdowns = (
        (final_data, 'ver', ('ver',)),
        (final_data, 'dayuse', ('dayuse',)),
        (final_data, 'strview', ('str_view',)),
        (final_data, 'distr', ('distr_type',)),
        (final_data, 'strview_dayuse', ('str_view', 'dayuse')),
        (final_data, 'strview_distr', ('str_view', 'distr_type')),
        (final_data, 'distr_dayuse', ('distr_type', 'dayuse')),
        (final_data, 'strview_distr_dayuse', ('str_view', 'distr_type', 'dayuse')),
        (joined, 'mic', ('mic',)),
        (joined, 'mic_ver', ('mic', 'ver')),
        (joined, 'mic_dayuse', ('mic', 'dayuse')),
        (joined, 'mic_strview', ('mic', 'str_view')),
        (final_data, 'mic_distr', ('mic', 'distr_type')),
    )
    for source, name, columns in breakdowns:
        yt.run_sort(source, sort_by=list(columns))
        yt.run_reduce(count_dau, source, dau_table(name),
                      sort_by=list(columns), reduce_by=list(columns))

    count_total_dau(final_data, dau_table('total'))

    for temporary in (sorted_sessions, parsed, joined):
        yt.remove(temporary)

########################################################################################################################

if __name__ == '__main__':
    # Script entry point: process every requested day and append each
    # finished day to the --output file, one per line.
    # NOTE(review): --mr_server, --yt_pool and --experiments are parsed but
    # not used in this block - confirm whether they should be applied.
    args = parse_args()
    # "with" closes the file even when a day fails, so the days finished
    # before the failure are still flushed to disk.
    with open(args.output, 'w') as result:
        dates = get_dates(args.timestamp, args.from_date, args.to_date)
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print(dates)
        for day in dates:
            distr_dict = distr_type(day)
            main(day, distr_dict)
            result.write(day + "\n")
