"""Luigi Tasks to collect and send stats
about known identifiers, e.g. phones, emails.
"""
import json
import os
from functools import partial

import luigi

from crypta.graph.v1.python.lib import graphite_sender
from crypta.graph.v1.python.lib.luigi import base_luigi_task
from crypta.graph.v1.python.lib.luigi import yt_luigi
from crypta.graph.v1.python.lib.luigi.yt_luigi import PostGraphTask
from crypta.graph.v1.python.matching.yuid_matching import graph_dict
from crypta.graph.v1.python.matching.yuid_matching.enrich import org_emails_classify
from crypta.graph.v1.python.rtcconf import config
from crypta.graph.v1.python.utils import mr_utils as mr
from crypta.graph.v1.python.utils import yt_clients

# Identifier types for which statistics are collected.
ID_TYPES = (
    config.ID_TYPE_EMAIL,
    config.ID_TYPE_PHONE,
    config.ID_TYPE_VKCOM,
    config.ID_TYPE_OKRU,
)

# Reference tables used to measure coverage, keyed by identifier type.
# When CRYPTA_GRAPH_CRYPTA_HOME is not set the path degrades to an empty string
# (the key is still present in the mapping).
REFERENCE_SAMPLE_BASES = {
    id_type: (os.path.join(config.CRYPTA_GRAPH_CRYPTA_HOME, relative_path) if config.CRYPTA_GRAPH_CRYPTA_HOME else "")
    for id_type, relative_path in (
        (config.ID_TYPE_EMAIL, "state/extras/reference-bases/audi-mvideo-emails"),
        (config.ID_TYPE_PHONE, "state/extras/reference-bases/sberbank_phones_hash"),
    )
}


def to_graphite(stats_table, date):
    """Read the collected stats rows from ``stats_table`` and send them to Graphite.

    Every row is a JSON object holding an ``id_type`` field plus arbitrary
    ``stat_name -> value`` pairs. Each pair becomes one metric tuple
    ``("id_stats", "<id_type>.<stat_name>", value)``.

    :param stats_table: YT table with one JSON-encoded stats row per identifier type.
    :param date: date passed through to ``graphite_sender.to_graphite_sender``.
    """
    yt_client = yt_clients.get_yt_client()
    metric_name = "id_stats"
    metrics = []
    for stat_record in yt_client.read_table(stats_table, format="json", raw=True):
        id_stats = json.loads(stat_record)
        # "id_type" is part of the metric path, not a metric on its own.
        id_type = id_stats.pop("id_type")
        # items() (not iteritems()) so the loop works on both Python 2 and Python 3.
        for stat_name, value in id_stats.items():
            # NOTE(review): value may be null, e.g. "reference_base_coverage" when the
            # reference table is empty - confirm the sender accepts that.
            stat_path = ".".join([id_type, stat_name])
            metrics.append((metric_name, stat_path, value))
    graphite_sender.to_graphite_sender(metrics, date)


def reduce_unique(key, _):
    """Reducer: emit exactly one row per reduce key, carrying only its ``id_value``.

    The grouped records (second argument) are ignored.
    """
    unique_row = {"id_value": key["id_value"]}
    yield unique_row


def reduce_unique_oneday(key, records, id_type, day):
    """Reducer: emit the key's ``id_value`` once if any record mentions ``day``.

    For each record the dates are looked up as
    ``record[id_type + "_dates"][id_value]``; the key is emitted as soon as one
    record has ``day`` among those dates, and the remaining records are not read.
    """
    id_value = key["id_value"]
    # any() stops at the first matching record, so at most one row is produced.
    seen_on_day = any(day in record[id_type + "_dates"][id_value].keys() for record in records)
    if seen_on_day:
        yield {"id_value": id_value}


def unique_count(input_table, workdir, id_type, day):
    """Count distinct ``id_value`` keys in ``input_table``, overall and for one day.

    Two reduce operations (by ``id_value``) write temporary tables inside
    ``workdir``; their row counts are read and the tables are removed afterwards.

    :param input_table: table to reduce by ``id_value``.
    :param workdir: folder for the temporary tables (with or without a trailing slash).
    :param id_type: identifier type; selects the ``<id_type>_dates`` field checked per record.
    :param day: day that must be present among an identifier's dates for the one-day count.
    :return: tuple ``(count_alltime, count_oneday)``.
    """
    # os.path.join keeps the paths valid whether or not workdir ends with "/",
    # matching how reference_coverage builds its temporary table path.
    alltime_tmp = os.path.join(workdir, "unique_keys")
    oneday_tmp = os.path.join(workdir, "unique_keys_oneday")
    yt_client = yt_clients.get_yt_client()
    yt_client.run_reduce(reduce_unique, input_table, alltime_tmp, reduce_by="id_value")
    yt_client.run_reduce(
        partial(reduce_unique_oneday, id_type=id_type, day=day), input_table, oneday_tmp, reduce_by="id_value"
    )
    count_alltime = yt_client.row_count(alltime_tmp)
    count_oneday = yt_client.row_count(oneday_tmp)
    yt_client.remove(alltime_tmp)
    yt_client.remove(oneday_tmp)
    return count_alltime, count_oneday


def reduce_intersection(key, recs):
    """Reducer: emit the key's ``id_value`` when its records carry exactly two
    distinct ``@table_index`` values.
    """
    source_tables = {rec["@table_index"] for rec in recs}
    if len(source_tables) != 2:
        return
    yield {"id_value": key["id_value"]}


def reference_coverage(input_table, reference_table, workdir):
    """Return the percentage of ``reference_table`` rows whose ``id_value`` also
    occurs in ``input_table``.

    Both tables are reduced together by ``id_value`` into a temporary table in
    ``workdir``, which is removed before returning.

    :return: coverage in percent as a float, or ``None`` when the reference
        table has no rows.
    """
    intersection_table = os.path.join(workdir, "tmp_intersection")
    client = yt_clients.get_yt_client()
    client.run_reduce(
        reduce_intersection,
        [reference_table, input_table],
        intersection_table,
        reduce_by="id_value",
    )
    reference_rows = client.row_count(reference_table)
    matched_rows = client.row_count(intersection_table)
    if reference_rows:
        coverage = 100.0 * (float(matched_rows) / float(reference_rows))
    else:
        # Nothing to compare against: avoid dividing by zero.
        coverage = None
    client.remove(intersection_table)
    return coverage


def collect_stats(id_type, input_table, output_folder, day):
    """Collect statistics for one identifier type.

    ``input_table`` is first sorted by ``id_value``. The returned dict holds:

    * ``id_type`` - the identifier type itself.
    * ``yuid_with_id_count`` - row count of ``input_table``.
    * ``unique_count`` - number of distinct ``id_value`` keys.
    * ``unique_count_oneday`` - number of distinct ``id_value`` keys having ``day``
      among their dates.
    * ``reference_base_coverage`` - intersection with the reference base, in percent;
      present only for identifier types listed in ``REFERENCE_SAMPLE_BASES``.
    """
    mr.sort_all([input_table], sort_by="id_value")
    total_rows = yt_clients.get_yt_client().row_count(input_table)
    alltime, oneday = unique_count(input_table, output_folder, id_type, day)
    stats = {
        "id_type": id_type,
        "yuid_with_id_count": total_rows,
        "unique_count": alltime,
        "unique_count_oneday": oneday,
    }
    if id_type not in REFERENCE_SAMPLE_BASES:
        return stats
    reference_table = REFERENCE_SAMPLE_BASES[id_type]
    stats["reference_base_coverage"] = reference_coverage(input_table, reference_table, output_folder)
    return stats


def write_stats(table, stats):
    """Write ``stats`` to ``table``, one JSON-serialized row per item."""
    client = yt_clients.get_yt_client()
    serialized_rows = []
    for stat in stats:
        serialized_rows.append(json.dumps(stat))
    client.write_table(table, serialized_rows, format="json", raw=True)


def get_output_folder(date):
    """Return the ``stat_new/`` folder path for ``date`` under ``config.YT_OUTPUT_FOLDER``."""
    path_parts = (config.YT_OUTPUT_FOLDER, date, "/stat_new/")
    return "".join(path_parts)


def collect_stat(date):
    """Collect stats for every type in ``ID_TYPES``, store them and send them to Graphite.

    The per-type input is the ``yuid_with_id_<id_type>`` table in
    ``config.GRAPH_YT_DICTS_FOLDER``; results are written to the ``id_stats``
    table in the output folder for ``date``.
    """
    dicts_folder = config.GRAPH_YT_DICTS_FOLDER
    output_folder = get_output_folder(date)
    mr.mkdir(output_folder)

    stats = [
        collect_stats(id_type, dicts_folder + "yuid_with_id_" + id_type, output_folder, date)
        for id_type in ID_TYPES
    ]

    stats_table = output_folder + "id_stats"
    write_stats(stats_table, stats)
    to_graphite(stats_table, date)


class IdStatTask(PostGraphTask):
    """Luigi Task to collect stats about known identifiers, e.g. phones, emails:

    * Absolute number of known IDs, linked with yuid.
    * Intersection with reference base.
    * Absolute number of yuids with ID of given type.
    """

    tags = ["v1"]

    def __init__(self, date, name):
        super(IdStatTask, self).__init__(date=date, name=name)

    def requires(self):
        """Depend on the dict-building tasks and on every configured reference table."""
        reference_inputs = []
        for id_type in ID_TYPES:
            if id_type in REFERENCE_SAMPLE_BASES:
                reference_inputs.append(yt_luigi.ExternalInput(REFERENCE_SAMPLE_BASES[id_type]))
        upstream_tasks = [
            graph_dict.YuidAllIdBySourceDictsTask(self.date),
            org_emails_classify.OrgEmailsClassifyTask(self.date),
        ]
        return upstream_tasks + reference_inputs

    def run_post_graph(self):
        """Collect, store and send the identifier stats for this task's date."""
        collect_stat(self.date)

    def output(self):
        """The ``id_stats`` table inside the output folder for this task's date."""
        stats_table = get_output_folder(self.date) + "id_stats"
        return [yt_luigi.YtTarget(stats_table)]


class YuidStatTask(base_luigi_task.BaseTask, yt_clients.YtClientMixin):
    """Luigi Task that counts ``yuid_with_all`` records per ``ip_activity_type``
    and sends the counts to Graphite.
    """

    date = luigi.Parameter()
    tags = ["v1"]

    def requires(self):
        return [graph_dict.YuidAllIdDictsTask(self.date)]

    def run(self):
        """Build the ``yuid_activity`` stats table, send its rows as metrics and
        stamp the table with this task's date.
        """
        dicts_folder = config.GRAPH_YT_DICTS_FOLDER
        stats_folder = dicts_folder + "stats/"
        activity_table = stats_folder + "yuid_activity"
        mr.mkdir(stats_folder)

        # Only the grouped column is read from the source table.
        source = self.yt.TablePath(dicts_folder + "yuid_with_all", columns=["ip_activity_type"])
        mr.count_field_recs(source, activity_table, ["ip_activity_type"])

        activity_metrics = [
            ("yuid_with_all.activity_type", row["ip_activity_type"], row["count"])
            for row in self.yt.read_table(activity_table)
        ]
        # NOTE(review): no date is passed to the sender here, unlike to_graphite() - confirm intended.
        graphite_sender.to_graphite_sender(activity_metrics)

        mr.set_generate_date(activity_table, self.date)

    def output(self):
        """The dated ``stats/yuid_activity`` table in the dicts folder."""
        return [yt_luigi.YtDateTarget(config.GRAPH_YT_DICTS_FOLDER + "stats/yuid_activity", self.date)]
