#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os

import luigi

from crypta.lib.python.yt import yt_helpers
from crypta.profile.lib import date_helpers
from crypta.profile.utils.config import config
from crypta.profile.utils.luigi_utils import YtDailyRewritableTarget, ExternalInput, BaseYtTask
from crypta.profile.utils.utils import report_ml_metrics_to_solomon
from crypta.profile.runners.matching.lib.income.parse_job_search import ParseJobSearch


# YQL query template, filled in with str.format() by MergeJobSearch.run().
#
# What the query does, step by step:
#   1. Declares a Python UDF that adds Laplace noise with scale 1 / eps to a
#      median salary value.
#   2. Reads the parsed job search tables in the range
#      [{first_parsed_date}, {last_parsed_date}] under {parsed_job_search_folder},
#      joins them with {indevice_yandexuid_table} by yandexuid (only rows with
#      id_type == 'puid') and aggregates per puid: the noised median salary,
#      the distinct list of dates and the list of salaries.  Only puids with at
#      least {min_days} distinct dates and between {min_hits} and {max_hits}
#      salary entries are kept.  The result is unioned with the rows already
#      stored in {output_merged_table}.
#   3. For every puid keeps the values with the latest update_time and drops
#      puids whose latest update_time is older than {last_date_for_accounting}.
#   4. Computes four percentiles of median_salary over the rows whose
#      update_time lies in [{for_train_first_date}, {for_train_last_date}).
#   5. Overwrites {output_thresholds_table} with a single row holding those
#      percentiles (columns A/B1/B2/C1 + "_upper_threshold") and overwrites
#      {output_merged_table} with the merged rows plus two extra columns:
#      income_segment ('A', 'B1', 'B2', 'C1' or 'C2', chosen by the thresholds)
#      and weight (2. for rows inside the training window, 1. otherwise).
#
# Note that {output_merged_table} is both an input and the output of the query.
#
# NOTE(review): the column name `min_disared_salaries` looks like a misspelling
# (presumably of "min_desired_salaries"); it is part of the stored table schema
# that the query reads back, so it must not be renamed without a migration.
merge_job_search_query_template = """
$laplace_noise_script = @@
def add_laplace_noise(median_salary, eps):
    import numpy as np

    return median_salary + np.random.laplace(0, 1. / eps)
@@;

$laplace_noise_udf = (Python::add_laplace_noise(Callable<(Double, Double)->Double>, $laplace_noise_script));

$merged_job_search = (
    SELECT puid,
        '{update_time}' AS update_time,
        $laplace_noise_udf(COALESCE(MEDIAN(salary), 0.), {eps}) AS median_salary,
        AGGREGATE_LIST_DISTINCT(`date`) AS days_active,
        AGGREGATE_LIST(salary) AS min_disared_salaries
    FROM RANGE(
        `{parsed_job_search_folder}`,
        `{first_parsed_date}`,
        `{last_parsed_date}`
    ) AS parsed_job_search
    INNER JOIN `{indevice_yandexuid_table}` AS indevice_yandexuid
    USING(yandexuid)
    WHERE indevice_yandexuid.id_type == 'puid'
    GROUP BY indevice_yandexuid.id AS puid
    HAVING ListLength(AGGREGATE_LIST_DISTINCT(`date`)) >= {min_days} AND
        ListLength(AGGREGATE_LIST(salary)) >= {min_hits} AND ListLength(AGGREGATE_LIST(salary)) <= {max_hits}
UNION ALL
    SELECT puid,
        update_time,
        median_salary,
        days_active,
        min_disared_salaries
    FROM `{output_merged_table}`
);

$merged_job_search = (
    SELECT puid,
        MAX(update_time) AS update_time,
        MAX_BY(median_salary, update_time) AS median_salary,
        MAX_BY(days_active, update_time) AS days_active,
        MAX_BY(min_disared_salaries, update_time) AS min_disared_salaries
    FROM $merged_job_search
    GROUP BY puid
    HAVING MAX(update_time) >= '{last_date_for_accounting}'
);

$percentile_A = (
    SELECT PERCENTILE(median_salary, {A_upper_percentile})
    FROM $merged_job_search
    WHERE update_time >= '{for_train_first_date}' AND update_time < '{for_train_last_date}'
);

$percentile_B1 = (
    SELECT PERCENTILE(median_salary, {B1_upper_percentile})
    FROM $merged_job_search
    WHERE update_time >= '{for_train_first_date}' AND update_time < '{for_train_last_date}'
);

$percentile_B2 = (
    SELECT PERCENTILE(median_salary, {B2_upper_percentile})
    FROM $merged_job_search
    WHERE update_time >= '{for_train_first_date}' AND update_time < '{for_train_last_date}'
);

$percentile_C1 = (
    SELECT PERCENTILE(median_salary, {C1_upper_percentile})
    FROM $merged_job_search
    WHERE update_time >= '{for_train_first_date}' AND update_time < '{for_train_last_date}'
);

INSERT INTO `{output_thresholds_table}` WITH TRUNCATE

SELECT $percentile_A AS A_upper_threshold, $percentile_B1 AS B1_upper_threshold,
    $percentile_B2 AS B2_upper_threshold, $percentile_C1 AS C1_upper_threshold;

INSERT INTO `{output_merged_table}` WITH TRUNCATE

SELECT t1.*,
    CASE
        WHEN median_salary <= $percentile_A THEN 'A'
        WHEN median_salary > $percentile_A AND median_salary <= $percentile_B1 THEN 'B1'
        WHEN median_salary > $percentile_B1 AND median_salary <= $percentile_B2 THEN 'B2'
        WHEN median_salary > $percentile_B2 AND median_salary <= $percentile_C1 THEN 'C1'
        ELSE 'C2'
    END AS income_segment,
    CASE
        WHEN update_time >= '{for_train_first_date}' AND update_time < '{for_train_last_date}' THEN 2.
        ELSE 1.
    END AS weight
FROM $merged_job_search AS t1
ORDER BY puid;
"""


class MergeJobSearch(BaseYtTask):
    """Merge parsed job-search salaries into the per-puid income table.

    Runs the ``merge_job_search_query_template`` YQL query, which rewrites the
    merged job search table and the income thresholds table, then publishes
    the thresholds to a stats table, reports the merged table size to Solomon
    and stamps both output tables with a ``generate_date`` attribute.
    """

    date = luigi.Parameter()
    juggler_host = config.CRYPTA_ML_JUGGLER_HOST
    task_group = 'import_socdem_data'

    def requires(self):
        """Return the upstream tasks: yesterday's parsed job search and the indevice yandexuid table."""
        return {
            'parsed_job_search': ParseJobSearch(date_helpers.get_yesterday(self.date)),
            'indevice_yandexuid': ExternalInput(config.INDEVICE_YANDEXUID),
        }

    def output(self):
        """Return the two daily-rewritable targets produced by this task."""
        return {
            'thresholds': YtDailyRewritableTarget(
                table=config.INCOME_THRESHOLDS_TABLE,
                date=self.date,
            ),
            'merged_job_search': YtDailyRewritableTarget(
                table=config.MERGED_JOB_SEARCH_TABLE,
                date=self.date
            ),
        }

    def write_thresholds_to_yt(self, thresholds):
        """Write the salary thresholds to the DataLens stats table.

        Args:
            thresholds: mapping from a thresholds-table column name (e.g.
                ``'B1_upper_threshold'``) to its numeric value. The segment
                name is the part of the column name before the first ``'_'``.
        """
        # dict.items() is available on both Python 2 and Python 3, whereas
        # dict.iteritems() exists only on Python 2.
        data = [
            {
                'fielddate': self.date,
                'segment': column.split('_')[0],
                'salary': int(threshold),
            }
            for column, threshold in thresholds.items()
        ]
        yt_helpers.write_stats_to_yt(
            yt_client=self.yt,
            table_path=config.DATALENS_SOCDEM_INCOME_THRESHOLDS_TABLE,
            data_to_write=data,
            schema={
                'fielddate': 'string',
                'segment': 'string',
                'salary': 'uint64',
            },
        )

    def run(self):
        """Execute the merge query and publish its results.

        Everything below happens inside a single YT transaction:
        the YQL query, the thresholds export, the Solomon size metric and
        setting ``generate_date`` on both output tables.
        """
        with self.yt.Transaction() as transaction:
            inputs = self.input()
            outputs = self.output()
            thresholds_table = outputs['thresholds'].table
            merged_table = outputs['merged_job_search'].table

            self.yql.query(
                query_string=merge_job_search_query_template.format(
                    update_time=self.date,
                    eps=0.005,  # F(x<=500 RUB) = 0.95
                    # The query reads a date range of tables from the folder
                    # that contains the parsed job search table.
                    parsed_job_search_folder=os.path.dirname(inputs['parsed_job_search'].table),
                    indevice_yandexuid_table=inputs['indevice_yandexuid'].table,
                    first_parsed_date=date_helpers.get_date_from_past(self.date, days=28),
                    last_parsed_date=date_helpers.get_date_from_past(self.date, days=1),
                    last_date_for_accounting=date_helpers.get_date_from_past(
                        self.date,
                        years=config.TIME_TO_ACCOUNT_INCOME_INFO_FOR_VOTING_YEARS,
                    ),
                    for_train_first_date=date_helpers.get_date_from_past(self.date, days=112),  # 4 * 28
                    for_train_last_date=date_helpers.get_date_from_past(self.date, days=56),  # 2 * 28
                    min_days=2,
                    min_hits=5,
                    max_hits=1000,
                    A_upper_percentile=0.05,
                    B1_upper_percentile=0.35,
                    B2_upper_percentile=0.65,
                    C1_upper_percentile=0.955,
                    output_thresholds_table=thresholds_table,
                    output_merged_table=merged_table,
                ),
                transaction=transaction,
            )

            # Only the first row of the thresholds table is used; the query
            # writes it from a single SELECT of four scalar percentiles.
            thresholds = next(self.yt.read_table(thresholds_table))
            self.write_thresholds_to_yt(thresholds)

            report_ml_metrics_to_solomon(
                service=config.SOLOMON_SOCDEM_SOURCES_SERVICE,
                metrics_to_send=[{
                    'labels': {
                        'socdem': 'income',
                        'source': self.__class__.__name__,
                        'metric': 'size',
                    },
                    'value': self.yt.row_count(merged_table),
                }],
            )

            # Stamp both outputs with the date they were generated for.
            for table in (thresholds_table, merged_table):
                self.yt.set_attribute(table, 'generate_date', self.date)
