import re
from collections import defaultdict

import luigi
import yt.wrapper as yt

from lib import graphite_sender
from lib.luigi import yt_luigi
from rtcconf import config


def get_tables(directory):
    """
    List the entries of a YT directory.

    Trailing slashes are stripped from ``directory`` before it is handed
    to ``yt.list``; whatever ``yt.list`` returns is passed back unchanged.
    """
    return yt.list(directory.rstrip('/'))


def is_time_table(table):
    """
    Tell whether a table name is a date or a datetime stamp.

    The whole name must have the format 'yyyy-MM-dd' or
    'yyyy-MM-dd'T'HH:mm:ss'. Returns True on a match and False otherwise.
    """
    timestamp_pattern = r'\A(\d{4}-\d{2}-\d{2})((T\d{2}:\d{2}:\d{2})?)\Z'
    return re.match(timestamp_pattern, table) is not None


def mapper_to_format(rec):
    """
    Emit both directions of a yuid <-> hash pair as key/value rows.

    For one input row two rows are yielded, in this order:
    the yuid keyed row (type 'yuid', value is the hash) and
    the hash keyed row (type 'fpc', value is the yuid).
    """
    directions = (
        ('yuid', 'hash', 'yuid'),
        ('hash', 'yuid', 'fpc'),
    )
    for key_field, value_field, row_type in directions:
        yield {'key': rec[key_field], 'value': rec[value_field], 'type': row_type}


def take_hash_count(keys, recs):
    """
    Count the distinct 'value' entries among the rows of one reduce group.

    Yields a single row holding the group's 'type' (taken from ``keys``)
    and the number of distinct values as 'count'.
    """
    distinct_values = {rec['value'] for rec in recs}
    yield {'type': keys['type'], 'count': len(distinct_values)}


@yt.aggregator
def take_avg_val_mapper(recs):
    """
    Collapse all incoming rows into one histogram row.

    Every input row carries a 'type' ('yuid' or 'fpc') and a 'count'.
    For each type the function tallies how many rows had each count and
    yields one row of the form
    ``{'yuid': {str(count): occurrences}, 'fpc': {str(count): occurrences}}``.
    Counts are stringified so they can serve as map keys in the output row.
    """
    histograms = {
        'yuid': defaultdict(int),
        'fpc': defaultdict(int),
    }
    for row in recs:
        kind = row['type']
        matched = row['count']
        histograms[kind][str(matched)] += 1
    # Convert the defaultdicts to plain dicts before emitting the row.
    yield {
        'yuid': dict(histograms['yuid']),
        'fpc': dict(histograms['fpc']),
    }


def cacl_perc(prec, dictionary):
    """
    Compute the truncated mean and the cut-off value of a histogram.

    :param prec: how many of the smallest observations to take into
        account (may be fractional, e.g. ``total * 0.95``).
    :param dictionary: histogram ``{value: occurrences}`` with numeric keys.
    :return: tuple ``(mean, percentile)`` where ``mean`` is the average of
        the ``prec`` smallest observations (0 when nothing was counted) and
        ``percentile`` is the value of the bucket in which the cumulative
        count reaches ``prec`` (0 for an empty histogram).
    """
    counter = 0
    weighted_sum = 0
    percentile = 0
    # Keys are unique, so sorting the (value, occurrences) pairs orders by value.
    for num, count in sorted(dictionary.items()):
        percentile = num
        # Strict comparison: a bucket that lands exactly on `prec` is the
        # cut-off bucket itself.  With `<=` the loop would step into the next
        # bucket and report its (larger) value as the percentile.
        if counter + count < prec:
            weighted_sum += num * count
            counter += count
            continue
        # This bucket reaches the threshold: take only the part that fits.
        diff = prec - counter
        weighted_sum += num * diff
        counter += diff
        break
    return (weighted_sum / float(counter) if counter else 0), percentile


def take_avg_val_reducer(_, recs):
    """
    Merge partial histograms and derive the final statistics.

    Each incoming row holds a 'yuid' and an 'fpc' mapping from a count
    (possibly stringified) to its number of occurrences. The mappings are
    summed per kind, and for every share ``p`` in the list below
    ``cacl_perc`` is evaluated on the ``total * p`` smallest observations.

    Yields one row ``{'result': {...}}`` whose keys are
    ``avg_<kind>_<p>`` (truncated mean) and ``perc_<kind>_<p>`` (cut-off
    value) for ``<kind>`` in 'yuid', 'fpc'.
    """
    kinds = ('yuid', 'fpc')
    histograms = {kind: defaultdict(int) for kind in kinds}
    for partial in recs:
        for kind in kinds:
            for num, count in partial[kind].items():
                # Keys and values are coerced to int before summing.
                histograms[kind][int(num)] += int(count)

    totals = {kind: sum(histograms[kind].values()) for kind in kinds}
    perc = [1, 0.99, 0.95, 0.9, 0.75, 0.5]

    result_dict = {}
    for kind in kinds:
        for p in perc:
            # cacl_perc returns (mean, cut-off value); both are published.
            average, cutoff = cacl_perc(totals[kind] * p, histograms[kind])
            suffix = kind + '_' + str(p)
            result_dict['avg_' + suffix] = average
            result_dict['perc_' + suffix] = cutoff
    yield {'result': result_dict}


class FpcStatTask(yt_luigi.BaseYtTask):
    """
    Compute yuid <-> fpc-hash matching statistics and send them to Graphite.

    The statistics are only computed when the input folder lies under
    '//home/crypta/production/'; the local marker target is written in
    either case.
    """

    # Date the marker file and the Graphite batch are attributed to.
    date = luigi.Parameter()

    def output(self):
        """Return the local marker target of this task for ``self.date``."""
        marker_path = config.LOCAL_OUTPUT_FOLDER + 'fpc_matching_stat'
        return yt_luigi.TodayFileTarget(marker_path, self.date)

    def input_folders(self):
        """Return the mapping of logical input names to YT folders."""
        fpc_folder = config.GRAPH_YT_OUTPUT_FOLDER + 'realtime/watchlog_first_party/'
        return {'fpc_dir': fpc_folder}

    def run(self):
        """
        Run both map-reduce stages, publish the metrics and mark the task done.

        Stage one counts, per yuid and per hash, the distinct counterparts
        seen in the time-stamped input tables. Stage two folds those counts
        into histograms and derives the averages / cut-off values.
        """
        if '//home/crypta/production/' in self.in_f('fpc_dir'):
            with yt.TempTable() as per_key_counts:
                with yt.TempTable() as summary:
                    # Only tables named as a date / datetime are processed.
                    table_names = [
                        name
                        for name in get_tables(self.in_f('fpc_dir'))
                        if is_time_table(name)
                    ]
                    source_tables = [self.in_f('fpc_dir') + name for name in table_names]

                    yt.run_map_reduce(
                        mapper_to_format,
                        take_hash_count,
                        source_tables,
                        per_key_counts,
                        reduce_by=['key', 'type']
                    )

                    # NOTE(review): the mapper output has no 'fantom_key'
                    # column; presumably this routes every row into a single
                    # reduce group -- confirm.
                    yt.run_map_reduce(
                        take_avg_val_mapper,
                        take_avg_val_reducer,
                        per_key_counts,
                        summary,
                        reduce_by='fantom_key'
                    )

                    # Only the first row of the summary table is used.
                    metrics = {}
                    first_row = next(iter(yt.read_table(summary, raw=False)), None)
                    if first_row is not None:
                        metrics.update(first_row['result'])

                    graphite_sender.to_graphite_sender_batch(
                        'fpc_yuid_matching_stat', metrics.iteritems(), self.date
                    )

        yt_luigi.TodayFileTarget.done(config.LOCAL_OUTPUT_FOLDER + 'fpc_matching_stat', self.date)


if __name__ == "__main__":
    import os

    # Manual run: configure the YT client first, then execute the task
    # for a fixed date.
    yt.config["tabular_data_format"] = yt.YsonFormat(process_table_index=True)
    mr_server = os.getenv('RTCRYPTA_MR_SERVER')
    yt.config.set_proxy(mr_server)

    # NOTE(review): run() is called directly below rather than through a
    # luigi worker, so this START handler is presumably not fired here --
    # confirm.
    @yt_luigi.BaseYtTask.event_handler(luigi.Event.START)
    def on_task_start(_):
        yt_luigi.reset_global_yt_state()

    task = FpcStatTask(date='2017-09-26')
    task.run()
