#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

import luigi

from crypta.profile.lib import date_helpers

from crypta.profile.utils.config import config
from crypta.profile.utils.luigi_utils import (
    YtTarget,
    BaseYtTask,
    YtDailyRewritableTarget,
    OldNodesByNameCleaner,
    AttributeExternalInput,
)


# Input rows with more unique lemmas than this are skipped by prepare_daily_words_query.
MAX_WORDS_IN_QUERY = 200
# A word is kept by prepare_daily_words_query only when its number of distinct
# code points is strictly greater than this value.
MIN_DIFFERENT_CHARACTERS_IN_WORD = 2

# Minimal number of distinct ids (yandexuid or crypta_id) that must have a word
# for the word to get a weight in idf_calculation_query.
MIN_MONTHLY_ID_WITH_WORD = 500
# Minimal number of distinct yandexuids that must have a word within one daily
# table for the word to be kept by prepare_daily_words_query.
MIN_DAILY_YANDEXUID_WITH_WORD = 30

# A yandexuid with more retained words than this is excluded from the daily output.
MAX_DAILY_WORDS_FOR_YANDEXUID = 2000
# An id with more retained words than this is excluded from the word-list tables
# written by idf_calculation_query.
MAX_MONTHLY_WORDS_FOR_ID = 5000

# Used as the number of back dates requested from PrepareDailyReqansWords, as the
# cleaner lifetime for the flattened words directory, and as the offset of the
# range start date in CalculateReqansWordsWeights.
N_DAYS_TO_STORE_PREPROCESSED_WORDS = 35


# YQL template that turns one table of per-row lemma lists into a table of
# (yandexuid, word) pairs. Steps, in order:
#   1. $word_sample: drop rows whose yandexuid is NULL or 0, or whose de-duplicated
#      lemma list is longer than {max_words_in_query}.
#   2. $flattened_words: flatten lemmas into distinct (yandexuid, word) pairs and
#      keep only words with more than {min_different_characters_in_word} distinct
#      code points and without any of the characters 0-9.
#      NOTE(review): the comparison is strict (`>`), although the constant is
#      named MIN_* - confirm that excluding the boundary value is intended.
#   3. $frequent_words: keep words seen for at least
#      {min_daily_yandexuid_with_word} distinct yandexuids.
#   4. $filtered_ids: collect the remaining words per yandexuid and drop
#      yandexuids with more than {max_daily_words_for_yandexuid} of them.
#   5. Rewrite {output_table} (WITH TRUNCATE) with the flattened result.
# Placeholders: input_table, output_table, max_words_in_query,
# min_different_characters_in_word, min_daily_yandexuid_with_word,
# max_daily_words_for_yandexuid.
prepare_daily_words_query = """
$digit_set = ToSet(Unicode::ToCodePointList('0123456789'));

$word_sample = (
    SELECT
        yandexuid,
        ListUniq(Yson::ConvertToStringList(lemmas)) AS lemmas
    FROM `{input_table}`
    WHERE yandexuid IS NOT NULL
        AND yandexuid != 0
        AND ListLength(ListUniq(Yson::ConvertToStringList(lemmas))) <= {max_words_in_query}
);

$flattened_words = (
    SELECT DISTINCT yandexuid, word
    FROM $word_sample
    FLATTEN LIST BY lemmas AS word
    WHERE DictLength(ToSet(Unicode::ToCodePointList(CAST(word AS Utf8)))) > {min_different_characters_in_word}
          AND DictLength(SetIntersection(ToSet(Unicode::ToCodePointList(CAST(word AS Utf8))), $digit_set)) == 0
);

$frequent_words = (
    SELECT word
    FROM $flattened_words
    GROUP BY word
    HAVING COUNT(DISTINCT yandexuid) >= {min_daily_yandexuid_with_word}
);

$frequent_words_for_id = (
    SELECT
        t1.yandexuid AS yandexuid,
        t1.word AS word
    FROM $flattened_words AS t1
    LEFT SEMI JOIN $frequent_words AS t2
    USING (word)
);

$filtered_ids = (
    SELECT yandexuid, AGGREGATE_LIST_DISTINCT(word) AS words
    FROM $frequent_words_for_id
    GROUP BY yandexuid
    HAVING ListLength(AGGREGATE_LIST_DISTINCT(word)) <= {max_daily_words_for_yandexuid}
);

INSERT INTO `{output_table}`
WITH TRUNCATE
SELECT yandexuid, word
FROM $filtered_ids
FLATTEN LIST BY words AS word;
"""


class PrepareDailyReqansWords(BaseYtTask):
    """Run ``prepare_daily_words_query`` for a single date.

    The input is the node named after ``date`` inside ``config.REQANS_PARSED_DIR``;
    the output is the node named after ``date`` inside
    ``config.FLATTENED_REQANS_WORDS_DIRECTORY``.
    """

    date = luigi.Parameter()

    task_group = 'precalculate_affinity'

    def requires(self):
        """Return the external input for the date, declared with ``closed`` equal to ``True``."""
        source_path = os.path.join(config.REQANS_PARSED_DIR, self.date)
        return AttributeExternalInput(
            source_path,
            attribute_name='closed',
            attribute_value=True,
        )

    def output(self):
        """Return the ``YtTarget`` of the per-date flattened words table."""
        destination_path = os.path.join(config.FLATTENED_REQANS_WORDS_DIRECTORY, self.date)
        return YtTarget(destination_path)

    def run(self):
        """Format the daily query with the module thresholds and execute it in a YT transaction."""
        self.yt.config['spec_defaults']['pool'] = config.DEFAULT_POOL

        with self.yt.Transaction() as transaction:
            query_parameters = dict(
                input_table=self.input().table,
                max_words_in_query=MAX_WORDS_IN_QUERY,
                min_different_characters_in_word=MIN_DIFFERENT_CHARACTERS_IN_WORD,
                min_daily_yandexuid_with_word=MIN_DAILY_YANDEXUID_WITH_WORD,
                max_daily_words_for_yandexuid=MAX_DAILY_WORDS_FOR_YANDEXUID,
                output_table=self.output().table,
            )
            query_text = prepare_daily_words_query.format(**query_parameters)
            self.yql.query(query_text, transaction=transaction)


# YQL template that computes per-word weights and per-id word lists over a range
# of daily flattened-word tables. Steps:
#   - $monthly_yandexuid_word_pairs: distinct (word, yandexuid) pairs read from
#     RANGE({daily_flattened_word_directory}, {start_date}, {end_date}), with the
#     yandexuid also exposed as a string `id` and id_type = 'yandexuid'.
#   - $monthly_crypta_id_word_pairs: the same pairs inner-joined with
#     {vertices_no_multi_profile} on (id, id_type) to obtain crypta_id.
#   - $yandexuid_word_weights / $crypta_id_word_weights: for every word with at
#     least {min_monthly_id_with_word} distinct ids, `count` is the number of
#     distinct ids, `idf` is Math::Log(total_ids / count) and `weight` is
#     total_ids / count cast to Uint64.
#   - Four tables are rewritten (WITH TRUNCATE): the two weight tables sorted by
#     word, and the two word-list tables sorted by id. The word lists contain
#     only words present in the corresponding weights subquery, and ids with more
#     than {max_monthly_words_for_id} such words are excluded.
# Placeholders: daily_flattened_word_directory, start_date, end_date,
# vertices_no_multi_profile, yandexuid_word_weights_table,
# crypta_id_word_weights_table, yandexuid_words_table, crypta_id_words_table,
# min_monthly_id_with_word, max_monthly_words_for_id.
idf_calculation_query = """
$monthly_yandexuid_word_pairs = (
    SELECT
        word,
        yandexuid,
        CAST(yandexuid AS String) AS id,
        'yandexuid' AS id_type
    FROM RANGE(`{daily_flattened_word_directory}`, `{start_date}`, `{end_date}`)
    GROUP BY word, yandexuid
);

$total_yandexuid_count = (
    SELECT COUNT(DISTINCT yandexuid)
    FROM $monthly_yandexuid_word_pairs
);

$monthly_crypta_id_word_pairs = (
    SELECT
        t1.word AS word,
        CAST(t2.cryptaId AS Uint64) AS crypta_id
    FROM $monthly_yandexuid_word_pairs AS t1
    INNER JOIN `{vertices_no_multi_profile}` AS t2
    USING (id, id_type)
);

$total_crypta_id_count = (
    SELECT COUNT(DISTINCT crypta_id)
    FROM $monthly_crypta_id_word_pairs
);

$yandexuid_word_weights = (
    SELECT
        word,
        COUNT(DISTINCT yandexuid) AS `count`,
        Math::Log(CAST($total_yandexuid_count AS Double) / COUNT(DISTINCT yandexuid)) AS idf,
        CAST(CAST($total_yandexuid_count AS Double) / COUNT(DISTINCT yandexuid) AS Uint64) AS weight
    FROM $monthly_yandexuid_word_pairs
    GROUP BY word
    HAVING COUNT(DISTINCT yandexuid) >= {min_monthly_id_with_word}
);

$crypta_id_word_weights = (
    SELECT
        word,
        COUNT(DISTINCT crypta_id) AS `count`,
        Math::Log(CAST($total_crypta_id_count AS Double) / COUNT(DISTINCT crypta_id)) AS idf,
        CAST(CAST($total_crypta_id_count AS Double) / COUNT(DISTINCT crypta_id) AS Uint64) AS weight
    FROM $monthly_crypta_id_word_pairs
    GROUP BY word
    HAVING COUNT(DISTINCT crypta_id) >= {min_monthly_id_with_word}
);

INSERT INTO `{yandexuid_word_weights_table}` WITH TRUNCATE
SELECT word, `count`, idf, weight
FROM $yandexuid_word_weights
ORDER BY word;

INSERT INTO `{crypta_id_word_weights_table}` WITH TRUNCATE
SELECT word, `count`, idf, weight
FROM $crypta_id_word_weights
ORDER BY word;

INSERT INTO `{yandexuid_words_table}` WITH TRUNCATE
SELECT yandexuid, AGGREGATE_LIST_DISTINCT(word) AS words
FROM (
    SELECT t1.word AS word, t1.yandexuid AS yandexuid
    FROM $monthly_yandexuid_word_pairs AS t1
    LEFT SEMI JOIN $yandexuid_word_weights AS t2
    USING (word)
)
GROUP BY yandexuid
HAVING ListLength(AGGREGATE_LIST_DISTINCT(word)) <= {max_monthly_words_for_id}
ORDER BY yandexuid;

INSERT INTO `{crypta_id_words_table}` WITH TRUNCATE
SELECT crypta_id, AGGREGATE_LIST_DISTINCT(word) AS words
FROM (
    SELECT t1.word AS word, t1.crypta_id AS crypta_id
    FROM $monthly_crypta_id_word_pairs AS t1
    LEFT SEMI JOIN $crypta_id_word_weights AS t2
    USING (word)
)
GROUP BY crypta_id
HAVING ListLength(AGGREGATE_LIST_DISTINCT(word)) <= {max_monthly_words_for_id}
ORDER BY crypta_id;
"""


class CalculateReqansWordsWeights(BaseYtTask):
    """Run ``idf_calculation_query`` over the stored daily flattened-word tables.

    Produces four tables: word weights and per-id word lists, each for
    yandexuid and for crypta_id. Every output table gets a ``generate_date``
    attribute equal to ``date``.
    """

    date = luigi.Parameter()

    task_group = 'precalculate_affinity'

    def requires(self):
        """Return the daily preparation tasks and the cleaner of old daily tables.

        Daily tasks are created for the dates produced by
        ``date_helpers.generate_back_dates`` starting from the day before
        ``date``, with ``N_DAYS_TO_STORE_PREPROCESSED_WORDS`` as the count.
        """
        return {
            'daily_tasks': [
                PrepareDailyReqansWords(date_string)
                for date_string in date_helpers.generate_back_dates(
                    date_helpers.get_yesterday(self.date),
                    N_DAYS_TO_STORE_PREPROCESSED_WORDS,
                )
            ],
            'cleaner': OldNodesByNameCleaner(
                self.date,
                folder=config.FLATTENED_REQANS_WORDS_DIRECTORY,
                lifetime=N_DAYS_TO_STORE_PREPROCESSED_WORDS,
            ),
        }

    def output(self):
        """Return the four daily-rewritable targets keyed by their role."""
        return {
            'yandexuid_word_weights': YtDailyRewritableTarget(
                config.YANDEXUID_WORD_WEIGHTS_TABLE,
                self.date,
            ),
            'crypta_id_word_weights': YtDailyRewritableTarget(
                config.CRYPTA_ID_WORD_WEIGHTS_TABLE,
                self.date,
            ),
            'yandexuid_words': YtDailyRewritableTarget(
                config.YANDEXUID_WORDS_TABLE,
                self.date,
            ),
            'crypta_id_words': YtDailyRewritableTarget(
                config.CRYPTA_ID_WORDS_TABLE,
                self.date,
            ),
        }

    def run(self):
        """Execute the IDF query in a YT transaction and stamp the outputs with ``generate_date``."""
        self.yt.config['spec_defaults']['pool'] = config.DEFAULT_POOL

        with self.yt.Transaction() as transaction:
            # Build the targets once and reuse them for the query and the attributes.
            outputs = self.output()

            self.yql.query(
                idf_calculation_query.format(
                    daily_flattened_word_directory=config.FLATTENED_REQANS_WORDS_DIRECTORY,
                    start_date=date_helpers.get_date_from_past(self.date, N_DAYS_TO_STORE_PREPROCESSED_WORDS),
                    end_date=self.date,
                    vertices_no_multi_profile=config.VERTICES_NO_MULTI_PROFILE,
                    yandexuid_word_weights_table=outputs['yandexuid_word_weights'].table,
                    crypta_id_word_weights_table=outputs['crypta_id_word_weights'].table,
                    yandexuid_words_table=outputs['yandexuid_words'].table,
                    crypta_id_words_table=outputs['crypta_id_words'].table,
                    min_monthly_id_with_word=MIN_MONTHLY_ID_WITH_WORD,
                    max_monthly_words_for_id=MAX_MONTHLY_WORDS_FOR_ID,
                ),
                transaction=transaction,
            )

            # dict.values() exists on both Python 2 and Python 3, unlike iteritems().
            for target in outputs.values():
                self.yt.set_attribute(target.table, 'generate_date', self.date)


# YQL template that attaches a weight to every (id, word) pair.
# The per-id word lists from {yandexuid_words_table} and {crypta_id_words_table}
# are flattened into one row per word, inner-joined by `word` with the matching
# weights table, and written (WITH TRUNCATE) as (id, word, weight) rows sorted by
# id into {yandexuid_output_table} and {crypta_id_output_table}.
# Placeholders: yandexuid_words_table, crypta_id_words_table,
# yandexuid_word_weights_table, crypta_id_word_weights_table,
# yandexuid_output_table, crypta_id_output_table.
reqans_words_affinity_query_template = """
$yandexuid_words = (
    SELECT yandexuid, word
    FROM `{yandexuid_words_table}`
    FLATTEN LIST BY words AS word
);

$crypta_id_words = (
    SELECT crypta_id, word
    FROM `{crypta_id_words_table}`
    FLATTEN LIST BY words AS word
);

INSERT INTO `{yandexuid_output_table}` WITH TRUNCATE
SELECT
    words.yandexuid AS yandexuid,
    words.word AS word,
    weights.weight AS weight
FROM $yandexuid_words AS words
INNER JOIN `{yandexuid_word_weights_table}` AS weights
USING(word)
ORDER BY yandexuid;

INSERT INTO `{crypta_id_output_table}` WITH TRUNCATE
SELECT
    words.crypta_id AS crypta_id,
    words.word AS word,
    weights.weight AS weight
FROM $crypta_id_words AS words
INNER JOIN `{crypta_id_word_weights_table}` AS weights
USING(word)
ORDER BY crypta_id;
"""


class PrecalculateReqansWordsAffinity(BaseYtTask):
    """Run ``reqans_words_affinity_query_template`` on the outputs of ``CalculateReqansWordsWeights``.

    Writes one table for yandexuid and one for crypta_id and sets a
    ``generate_date`` attribute equal to ``date`` on both.
    """

    date = luigi.Parameter()

    task_group = 'precalculate_affinity'

    def requires(self):
        """Return the weights calculation task for the same date."""
        return CalculateReqansWordsWeights(self.date)

    def output(self):
        """Return the daily-rewritable affinity targets keyed by id type."""
        return {
            'crypta_id': YtDailyRewritableTarget(config.CRYPTA_ID_REQANS_WORDS_AFFINITY, self.date),
            'yandexuid': YtDailyRewritableTarget(config.YANDEXUID_REQANS_WORDS_AFFINITY, self.date),
        }

    def run(self):
        """Execute the affinity query in a YT transaction and stamp the outputs with ``generate_date``."""
        self.yt.config['spec_defaults']['pool'] = config.DEFAULT_POOL

        with self.yt.Transaction() as transaction:
            # Resolve inputs and outputs once and reuse them below.
            inputs = self.input()
            outputs = self.output()

            self.yql.query(
                query_string=reqans_words_affinity_query_template.format(
                    yandexuid_words_table=inputs['yandexuid_words'].table,
                    crypta_id_words_table=inputs['crypta_id_words'].table,
                    yandexuid_word_weights_table=inputs['yandexuid_word_weights'].table,
                    crypta_id_word_weights_table=inputs['crypta_id_word_weights'].table,
                    yandexuid_output_table=outputs['yandexuid'].table,
                    crypta_id_output_table=outputs['crypta_id'].table,
                ),
                transaction=transaction,
            )

            # dict.values() exists on both Python 2 and Python 3, unlike iteritems().
            for target in outputs.values():
                self.yt.set_attribute(target.table, 'generate_date', self.date)
