from crypta.graph.fuzzy.lib.luiger import BaseTask, DateParameter
from crypta.graph.fuzzy.lib.common import cached_property
from yt.wrapper import with_context as yt_with_context, common as yt_common
from extract_all_emails_task import ExtractAllEmailsAndLoginsTask

from functools import partial
from itertools import islice, ifilter

import crypta.graph.fuzzy.lib.config as conf


def mapper_extract_emails(record):
    """
    Emit one row per distinct substring (n-gram) of the email login.

    The login is the part of record["email"] before the first "@". Every
    substring of length 3 to 7 is collected into a set, so each distinct
    fragment is emitted once per input record.

    :param record: input row; the "email" and "yuids" fields are read
    :return: generator of {"fragment", "email", "yuids"} rows; nothing is
        yielded when the email/login is missing or empty, when "yuids" is
        empty, or when the login is shorter than 3 characters
    """
    # `or ""` also covers a row where the "email" key is present but null,
    # which would otherwise fail on `.split`.
    email = record.get("email") or ""
    login_email = email.split("@")[0]
    yuids = record.get("yuids", [])
    if not (login_email and yuids):
        return

    frags = set()
    # `range` iterates exactly like `xrange` here and exists on py2 and py3.
    for step in range(3, 8):
        for i in range(len(login_email) - step + 1):
            frags.add(login_email[i : i + step])

    # Every fragment is 3..7 characters long by construction, so no extra
    # length filter is required before emitting.
    for frag in frags:
        yield {"fragment": frag, "email": email, "yuids": yuids}


def reducer_count_emails(keys, records):
    """
    Sum the "cnt" values of one fragment group and emit a frequency row.

    A record without a "cnt" field contributes -1 to the sum, so a group of
    N such records ends up with cnt == -N. A group whose sum is exactly -1
    emits nothing.

    :param keys: reduce key; keys["fragment"] is read
    :param records: iterable of rows sharing that fragment
    :return: generator of at most one {"fragment", "size", "cnt"} row, where
        "size" is the fragment length
    """
    total = sum(row.get("cnt", -1) for row in records)
    if total == -1:
        return
    fragment = keys["fragment"]
    yield {"fragment": fragment, "size": len(fragment), "cnt": total}


def extract_ngrams_from_email_tables(all_fragments_table, frequency_table, yt=None, source_table=None):
    """
    Build the fragment table and the per-fragment frequency table.

    Runs three operations in order: a map with mapper_extract_emails from
    source_table into all_fragments_table, an in-place sort of
    all_fragments_table by "fragment", and a reduce with
    reducer_count_emails into frequency_table.

    :param all_fragments_table: output of the map step (left sorted by "fragment")
    :param frequency_table: output of the reduce step
    :param yt: client object providing run_map / run_sort / run_reduce
    :param source_table: input table with emails
    :raises Exception: when yt or source_table is falsy
    """
    if not (yt and source_table):
        raise Exception("INVALID ARGUMENTS")

    job_size = 32 * yt_common.MB

    # Step 1: explode every email into its fragments.
    yt.run_map(
        mapper_extract_emails,
        source_table,
        all_fragments_table,
        spec={"data_size_per_job": job_size},
    )

    # Step 2: the reduce below groups by "fragment", so sort by it first.
    yt.run_sort(all_fragments_table, sort_by=["fragment"])

    # Step 3: collapse each fragment group into a single frequency row.
    yt.run_reduce(
        reducer_count_emails,
        all_fragments_table,
        frequency_table,
        reduce_by=["fragment"],
        spec={"data_size_per_job": job_size},
    )


def count_all_cnts(keys, records, threshold=None):
    """
    Count how many records share the same "cnt" value.

    :param keys: reduce key; keys["cnt"] is read
    :param records: iterable of rows with that "cnt"
    :param threshold: upper bound on the group size; a group with more
        records than this emits nothing. None means "no limit".
    :return: generator of at most one row
        {"cnt": ..., "emails_with_fragment_count": -<group size>}
    """
    emails_with_fragment_count = 0
    for _ in records:
        emails_with_fragment_count += 1
        # Explicit None check: on py2 `int > None` is always true (the group
        # would be silently dropped), on py3 it raises TypeError.
        if threshold is not None and emails_with_fragment_count > threshold:
            # Oversized group: stop reading early and emit nothing.
            return
    # The group size is emitted negated -- presumably so that an ascending
    # sort on this column lists the largest groups first.
    yield {"cnt": keys["cnt"], "emails_with_fragment_count": -emails_with_fragment_count}


def count_ngrams_barrier(frequency_table, PAIR_COUNT_THRESHOLD, OPTIMAZE_FRAG_MAX_COUNT, yt=None):
    """
    Compute the fragment-frequency barrier used to limit candidate pairs.

    The frequency table is sorted by "cnt" and reduced with count_all_cnts
    (threshold=OPTIMAZE_FRAG_MAX_COUNT) into rows of
    ("cnt", "emails_with_fragment_count"). Those rows are sorted ascending by
    "emails_with_fragment_count" and read one by one; for every row whose
    abs(cnt) is not 1, count * (count + 1) * abs(emails_with_fragment_count) // 2
    is added to a running total.

    :param frequency_table: table with a "cnt" column per fragment
    :param PAIR_COUNT_THRESHOLD: limit for the running total
    :param OPTIMAZE_FRAG_MAX_COUNT: group-size threshold passed to count_all_cnts
    :param yt: client object providing TempTable / run_sort / run_reduce / read_table
    :return: abs(cnt) - 1 of the row on which the running total first exceeds
        PAIR_COUNT_THRESHOLD; otherwise abs(cnt) of the last row read, or 0
        when the grouped table has no rows
    :raises Exception: when yt is falsy
    """
    if not yt:
        raise Exception("INVALID ARGUMENT YT")

    with yt.TempTable() as frequency_table_sorted, yt.TempTable() as frequency_table_groupped:
        # The reduce below groups by "cnt", so the input has to be sorted by it.
        yt.run_sort(frequency_table, frequency_table_sorted, sort_by=["cnt"])

        yt.run_reduce(
            partial(count_all_cnts, threshold=OPTIMAZE_FRAG_MAX_COUNT),
            frequency_table_sorted,
            frequency_table_groupped,
            reduce_by=["cnt"],
            spec={"data_size_per_job": 64 * yt_common.MB},
        )

        yt.run_sort(frequency_table_groupped, sort_by=["emails_with_fragment_count"])

        total_pairs = 0
        # Initialised up front so that an empty grouped table returns 0
        # instead of failing with UnboundLocalError on the final return.
        count = 0

        for row in yt.read_table(frequency_table_groupped):
            count = abs(row["cnt"])
            if count == 1:
                # Rows with a count of one are skipped and add nothing to the total.
                continue
            total_pairs += count * (count + 1) * (abs(row["emails_with_fragment_count"])) // 2
            if total_pairs > PAIR_COUNT_THRESHOLD:
                return count - 1

        return count


def filter_data_for_candidates(record, max_ngram_counts=None):
    """
    Mapper that keeps only fragments whose frequency lies in an open range.

    The frequency is abs(int(record["cnt"])); a row is kept when it is
    strictly greater than 1 and strictly less than max_ngram_counts.

    :param record: row with "cnt" and "fragment" fields
    :param max_ngram_counts: exclusive upper bound for the frequency
    :return: generator of at most one {"fragment": ...} row
    """
    occurrences = abs(int(record["cnt"]))
    if occurrences <= 1 or occurrences >= max_ngram_counts:
        return
    yield {"fragment": record["fragment"]}


@yt_with_context
def reduce_yield_candidates_groups(keys, records, context):
    """
    Collect the emails of one fragment group, if the fragment was whitelisted.

    Rows are distinguished by context.table_index: a row with index 0 marks
    the fragment as whitelisted and is otherwise skipped; any other row has
    its "email" and "yuids" collected. If a row with index 1 arrives before
    any row with index 0, the whole group is dropped and nothing is emitted.

    :param keys: reduce key; keys["fragment"] is read
    :param records: rows of both input tables sharing the fragment
    :param context: object exposing table_index for the current row
    :return: generator of at most one row {"emails": [...], "yuids": {email: yuids}, "fragment": ...}
    """
    whitelisted = False
    collected_emails = []
    yuids_by_email = {}
    for row in records:
        source_index = context.table_index
        if source_index == 0:
            # Marker row from the first input table: remember it and move on.
            whitelisted = True
            continue
        if source_index == 1 and not whitelisted:
            # Data row seen before any marker row: skip the entire group.
            return
        email = row["email"]
        collected_emails.append(email)
        yuids_by_email[email] = row["yuids"]
    yield {"emails": collected_emails, "yuids": yuids_by_email, "fragment": keys["fragment"]}


def map_yield_candidates(record):
    """
    Expand one fragment group into pairs of yuids from different logins.

    Every unordered pair of entries in record["emails"] whose logins (the
    part before the first "@") differ is combined with every pair of their
    yuids taken from record["yuids"]. Within an emitted row the side with
    the smaller yuid goes to the "left" columns; on equal yuids the earlier
    email stays on the left.

    :param record: row with "emails" (list), "yuids" (email -> yuids) and "fragment"
    :return: generator of rows with the left/right yuid columns named by
        conf.Constants, plus "email_left", "email_right" and "fragment"
    """
    all_emails = record["emails"]
    yuids_by_email = record["yuids"]
    for position, email_a in enumerate(all_emails):
        for email_b in islice(all_emails, position + 1, None):
            if email_a.split("@")[0] == email_b.split("@")[0]:
                # Same login on both sides: not a "similar email" candidate.
                continue
            for yuid_a in yuids_by_email[email_a]:
                for yuid_b in yuids_by_email[email_b]:
                    left, right = (yuid_a, email_a), (yuid_b, email_b)
                    if yuid_b < yuid_a:
                        # Keep the smaller yuid on the left-hand side.
                        left, right = right, left
                    yield {
                        conf.Constants.YUID_LEFT: left[0],
                        conf.Constants.YUID_RIGHT: right[0],
                        "email_left": left[1],
                        "email_right": right[1],
                        "fragment": record["fragment"],
                    }


def create_email_candidates(all_fragments_table, frequency_table, max_ngram_counts, dest_t, yt=None):
    """
    Build the table of candidate yuid pairs for classification.

    Steps, in order: filter frequency_table with filter_data_for_candidates
    into a temporary table and sort it by "fragment"; reduce that table
    together with all_fragments_table by "fragment" using
    reduce_yield_candidates_groups; map the grouped rows with
    map_yield_candidates into dest_t; sort dest_t by the left and right
    yuid columns.

    :param all_fragments_table: table of fragment rows
    :param frequency_table: table of per-fragment frequencies
    :param max_ngram_counts: exclusive upper frequency bound for the filter
    :param dest_t: destination table for the candidate pairs
    :param yt: client object providing TempTable / run_map / run_sort / run_reduce
    :raises Exception: when yt is falsy
    """
    if not yt:
        raise Exception("INVALID ARGUMENT YT")

    with yt.TempTable() as frequency_table_filtered, yt.TempTable() as all_candidates_groupped:
        # 1. Keep only the fragments whose frequency passes the filter.
        keep_allowed_fragments = partial(filter_data_for_candidates, max_ngram_counts=max_ngram_counts)
        yt.run_map(
            keep_allowed_fragments,
            frequency_table,
            frequency_table_filtered,
            spec={"data_size_per_job": 64 * yt_common.MB, "ordered": True},
        )
        yt.run_sort(frequency_table_filtered, sort_by=["fragment"])

        # 2. Join the whitelist (table index 0) with the fragment rows (table index 1).
        reduce_inputs = [frequency_table_filtered, all_fragments_table]
        yt.run_reduce(
            reduce_yield_candidates_groups,
            reduce_inputs,
            all_candidates_groupped,
            reduce_by=["fragment"],
            spec={"data_size_per_job": 32 * yt_common.MB},
        )

        # 3. Expand every group into pairs and order the result by the yuid columns.
        yt.run_map(
            map_yield_candidates,
            all_candidates_groupped,
            dest_t,
            spec={"data_size_per_job": 16 * yt_common.MB},
        )
        yt.run_sort(dest_t, sort_by=[conf.Constants.YUID_LEFT, conf.Constants.YUID_RIGHT])


class ExtractSimilarEmailsTask(BaseTask):
    """
    Task that builds the table of yuid pairs coming from similar emails.

    It extracts login fragments from the source email table, computes a
    frequency barrier, and writes the candidate pairs to the destination
    table, stamping it with the task date.
    """

    date = DateParameter()
    # Passed to count_ngrams_barrier as the limit for its running pair total.
    PAIR_COUNT_THRESHOLD = 5 * 10 ** 8
    # Passed to count_ngrams_barrier, which forwards it to count_all_cnts as the group-size threshold.
    OPTIMAZE_FRAG_MAX_COUNT = 2000

    def requires(self):
        """
        Upstream dependency: ExtractAllEmailsAndLoginsTask for the same date.
        """
        return ExtractAllEmailsAndLoginsTask(date=self.date)

    def output(self):
        """
        Single target checking the destination table against the ISO-formatted task date.
        """
        iso_date = self.date.isoformat()
        return [self.yt.targets.table_is_actual(self.destination, iso_date)]

    @cached_property
    def source_type(self):
        """Source type constant of this task."""
        return conf.SourceTypes.EMAIL_SIMILAR

    @cached_property
    def source(self):
        """Path of the input table with all emails."""
        return conf.Paths.sources.emails.ALL_EMAILS_TABLE

    @cached_property
    def destination(self):
        """Path of the output table with yuid pairs."""
        return conf.Paths.sources.emails.ALL_YUID_PAIRS_FROM_SIMILAR_EMAILS

    @cached_property
    def destination_schema(self):
        """Schema used when (re)creating the output table."""
        return conf.Paths.sources.emails.ALL_YUID_PAIRS_FROM_SIMILAR_EMAILS_SCHEMA

    def _run(self):
        """
        Run the pipeline: fragments -> frequencies -> barrier -> candidate pairs.
        """
        with self.yt.TempTable() as fragments:
            with self.yt.TempTable() as frequencies:
                extract_ngrams_from_email_tables(fragments, frequencies, yt=self.yt, source_table=self.source)

                max_ngram_counts = count_ngrams_barrier(
                    frequencies,
                    self.PAIR_COUNT_THRESHOLD,
                    self.OPTIMAZE_FRAG_MAX_COUNT,
                    yt=self.yt,
                )

                # Recreate the destination with its schema before it is filled.
                self.yt.create_table_with_schema(
                    self.destination,
                    self.destination_schema,
                    strict=True,
                    recreate_if_exists=True,
                )

                create_email_candidates(fragments, frequencies, max_ngram_counts, self.destination, yt=self.yt)

                # Stamp the result with the task date.
                self.yt.set(self.destination + "/@generate_date", self.date.isoformat())
