import re
from functools import partial

import luigi
from crypta.graph.soup.config.python import (  # N811 # noqa
    EDGE_TYPE as edges,
    ID_TYPE as ids,
    LOG_SOURCE as log_source,
    SOURCE_TYPE as source_type,
)

from crypta.graph.v1.python.data_imports.day_aggregate import (
    reduce_yuid_log_events_day,
    finalize_yuid_with_x_day_tables,
)
from crypta.graph.v1.python.lib.luigi import yt_luigi
from crypta.graph.v1.python.rtcconf import config
from crypta.graph.v1.python.utils import mr_utils as mr
from crypta.graph.v1.python.utils import uat_utils
from crypta.graph.v1.python.v2.soup.soup_tables import SoupDailyLogTable

# Soup edge type for the pair (yandexuid, Kinopoisk id), with Kinopoisk as the
# source type and the access log as the log source.
yuid_kpid_al = edges.get_edge_type(ids.YANDEXUID, ids.KINOPOISK_ID, source_type.KINOPOISK, log_source.ACCESS_LOG)
# Matches a ``yandexuid=<digits>`` pair that starts at a word boundary and captures the digits.
yandexuid_regex = re.compile(r"\byandexuid=(\d+)")


def extract_yuid(body):
    """Return the yandexuid found in *body* as a normalized decimal string.

    The first ``yandexuid=<digits>`` occurrence is taken and round-tripped
    through ``int`` (which, for example, drops leading zeros).

    :param body: text to search (e.g. a cookie header); may be empty or None.
    :return: the yandexuid as a string, or ``""`` when *body* is empty, holds
        no yandexuid, or the captured digits cannot be converted to an int.
    """
    # Cheap substring test first, so the regex only runs on promising input.
    if not body or "yandexuid=" not in body:
        return ""

    found = yandexuid_regex.search(body)
    if found is None:
        return ""

    try:
        return str(int(found.group(1)))
    except ValueError:
        return ""


def map_kinopoisk(rec):
    """Mapper: turn one access-log record into yandexuid <-> Kinopoisk uid outputs.

    Outputs:
      * a record whose user agent is reported as bad by ``uat_utils.Ua`` is
        re-emitted as is with ``@table_index`` set to 2, and nothing else is
        produced for it;
      * otherwise, when the record has a yandexuid, a ``vhost`` containing
        "kinopoisk" and a non-placeholder cookie string with an integer ``uid``
        field, two records are emitted: ``{"id_value": <kp uid>, "yuid": <yandexuid>}``
        (no explicit table index) and a soup record built with ``table_index=1``.

    :param rec: dict-like log record; the fields ``cookies``, ``vhost`` and
        ``user_agent`` are read.
    """
    yuid = extract_yuid(rec.get("cookies", ""))
    host = rec.get("vhost")
    cookies = rec.get("cookies")

    ua = uat_utils.Ua(rec.get("user_agent"))
    if ua.is_bad():
        rec["@table_index"] = 2  # bad ua
        yield rec
        return

    # "-" is treated as an empty-cookies placeholder.
    if not (yuid and host and "kinopoisk" in host and cookies and cookies != "-"):
        return

    kp_uid = mr.get_field_value("uid", cookies, separator="; ")
    # The return value of mr.get_field_value for a missing field is not visible
    # here; skip on any falsy result so that None never reaches int() below.
    if not kp_uid:
        return
    try:
        # Validation only: the uid is emitted in its original textual form.
        int(kp_uid)
    except (TypeError, ValueError):
        return

    yield {"id_value": kp_uid, "yuid": yuid}
    yield SoupDailyLogTable.make_rec(yuid, kp_uid, yuid_kpid_al, table_index=1)


def join_kinopoisk_email(kp_uid_key, recs):
    """Reducer: attach e-mails to yandexuid records that share the reduce key.

    ``mr.split_left_right`` splits *recs* into the yandexuid records and the
    e-mail records. Every (yandexuid record, e-mail record) combination is
    emitted once: the yandexuid record is updated in place so that its
    ``id_value`` becomes the e-mail, its ``id_type`` becomes
    ``config.ID_TYPE_EMAIL``, and the e-mail record's ``id_value`` is stored
    under ``config.ID_TYPE_KINOPOISK_UID``.

    If the split raises ``mr.OomLimitException``, a single diagnostic record is
    written to output table 1 instead.

    :param kp_uid_key: reduce key; its ``id_value`` field is read for the
        diagnostic record.
    :param recs: records of both input tables for this key.
    """
    try:
        left_recs, right_recs = mr.split_left_right(recs)
    except mr.OomLimitException as oom:
        oom_report = {"kp_yuid": kp_uid_key["id_value"], "oom": oom.recs_count, "@table_index": 1}
        yield oom_report
        return

    for yuid_rec in left_recs:
        for email_rec in right_recs:
            # The same dict is reused and overwritten for every e-mail record.
            kp_uid_value = email_rec["id_value"]
            email_value = email_rec["email"]
            yuid_rec[config.ID_TYPE_KINOPOISK_UID] = kp_uid_value
            yuid_rec["id_value"] = email_value
            yuid_rec["id_type"] = config.ID_TYPE_EMAIL
            yield yuid_rec


class ImportKinopoiskAccessLog(yt_luigi.BaseYtTask):
    """Luigi task importing yandexuid <-> Kinopoisk uid pairs from one day of the Kinopoisk log.

    A run produces two day tables in the ``yuid_raw`` output folder (yandexuid
    with Kinopoisk uid, and yandexuid with the e-mail joined in from the
    ``kinopoisk`` dict table) and fills the daily soup log for the
    yandexuid/Kinopoisk-id edge type.
    """

    # Day being processed; it is appended to the input and output folder paths.
    date = luigi.Parameter()
    # Soup daily tables are declared as task outputs only when ``date == run_date``.
    run_date = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(ImportKinopoiskAccessLog, self).__init__(*args, **kwargs)
        self.soup_log = SoupDailyLogTable(log_source.ACCESS_LOG, self.date, source_types=[yuid_kpid_al.SourceType])

    def input_folders(self):
        """Return the named input folders: the day's Kinopoisk log and the dicts folder."""
        return {"log": config.LOGFELLER_KINOPOISK_FOLDER + self.date, "dict": config.GRAPH_YT_DICTS_FOLDER}

    def output_folders(self):
        """Return the named per-day output folders (``yuid_raw`` and ``kinopoisk``)."""
        return {
            "yuid_raw": config.YT_OUTPUT_FOLDER + self.date + "/yuid_raw/",
            "kinopoisk": config.YT_OUTPUT_FOLDER + self.date + "/kinopoisk/",
        }

    def requires(self):
        """Depend on the day's log (which may be empty) and on the ``kinopoisk`` dict table."""
        return [
            yt_luigi.ExternalInput(self.in_f("log"), allow_empty=True),
            yt_luigi.ExternalInput(self.in_f("dict") + "kinopoisk"),
        ]

    def before_run(self):
        """Make sure the output folders and the soup log directory exist."""
        mr.mkdir(self.out_f("yuid_raw"))
        mr.mkdir(self.out_f("kinopoisk"))
        self.soup_log.ensure_dir()

    def _yuid_raw_table(self, id_type):
        """Return the ``yuid_raw`` table path ``yuid_with_<id_type>_<config.ID_SOURCE_TYPE_KINOPOISK>``.

        Shared by ``run`` and ``output`` so that both always refer to the same tables.
        """
        return self.out_f("yuid_raw") + "_".join(("yuid_with", id_type, config.ID_SOURCE_TYPE_KINOPOISK))

    def run(self):
        """Extract the pairs from the log, aggregate them per day and enrich them with e-mails."""
        kp_uid_out_table = self._yuid_raw_table(config.ID_TYPE_KINOPOISK_UID)
        kp_email_out_table = self._yuid_raw_table(config.ID_TYPE_EMAIL)

        kp_uid_tmp_table = self.out_f("kinopoisk") + "yuid_uid_kinopoisk"

        soup_table = self.soup_log.create()
        bad_ua_table = self.out_f("kinopoisk") + "bad_ua"

        # The output order follows the table indices map_kinopoisk emits:
        # records without an index, then index 1 (soup), then index 2 (bad ua).
        self.yt.run_map(map_kinopoisk, self.in_f("log"), [kp_uid_tmp_table, soup_table, bad_ua_table])

        self.soup_log.prepare_daily_tables_from_log()

        self.yt.run_map_reduce(
            None,
            partial(
                reduce_yuid_log_events_day,
                dt=self.date,
                id_type=config.ID_TYPE_KINOPOISK_UID,
                source_type=config.ID_SOURCE_TYPE_KINOPOISK,
            ),
            kp_uid_tmp_table,
            kp_uid_out_table,
            reduce_by=config.ID_TYPE_YUID,
        )

        # enrich kp with email
        self.yt.run_sort(kp_uid_out_table, sort_by="id_value")
        # The second output receives the records join_kinopoisk_email emits with table index 1.
        self.yt.run_reduce(
            join_kinopoisk_email,
            [kp_uid_out_table, self.in_f("dict") + "kinopoisk"],
            [kp_email_out_table, self.out_f("kinopoisk") + "email_join_oom"],
            reduce_by="id_value",
        )

        mr.merge_chunks_all([kp_uid_tmp_table, bad_ua_table, kp_uid_out_table, kp_email_out_table])
        finalize_yuid_with_x_day_tables([kp_uid_out_table, kp_email_out_table])

    def output(self):
        """Return targets for both ``yuid_raw`` tables, plus the soup daily tables when ``date == run_date``."""
        if self.date == self.run_date:
            soup_out_tables = self.soup_log.daily_tables_targets()
        else:
            soup_out_tables = []

        yuid_raw_tables = [
            self._yuid_raw_table(config.ID_TYPE_KINOPOISK_UID),
            self._yuid_raw_table(config.ID_TYPE_EMAIL),
        ]

        return [yt_luigi.YtTarget(t, allow_empty=True) for t in yuid_raw_tables] + soup_out_tables
