import os

import luigi

from crypta.lib.python.juggler.juggler_helpers import report_event_to_juggler
from crypta.profile.lib import date_helpers
from crypta.profile.utils import (
    luigi_utils,
    loggers,
    utils,
)
from crypta.profile.utils.config import config
from crypta.profile.utils.clients import clickhouse


# Maps a profile age segment label to the integer that is compared against
# the ``Age`` value read from Metrica for the same user.
profiles_age_segment_to_metrica_age_segment = dict([
    ('0_17', 17),
    ('18_24', 18),
    ('25_34', 25),
    ('35_44', 35),
    ('45_54', 45),
    ('55_99', 55),
])


class MetricaConsistencyMonitoring(luigi_utils.BaseYtTask):
    """Monitor how well exported socdem profiles agree with Metrica data.

    The task samples rows from the 14-day yandexuid export profiles table,
    looks the same yandexuids up in the ClickHouse ``visits_all`` table and
    compares gender and age. The share of mismatching yandexuids is sent to
    Graphite and an OK/WARN event is reported to Juggler.
    """

    date = luigi.Parameter()
    task_group = 'consistency_monitoring'

    # Juggler service name used for every event reported by this task.
    _JUGGLER_SERVICE = 'offline_classification_metrica_monitoring'

    def __init__(self, date):
        super(MetricaConsistencyMonitoring, self).__init__(date)
        self.clickhouse_client = clickhouse.ClickhouseClient(logger=self.logger)

    def requires(self):
        """Depend on the 14-day yandexuid export profiles table."""
        return luigi_utils.ExternalInput(config.YANDEXUID_EXPORT_PROFILES_14_DAYS_TABLE)

    def output(self):
        """Return the completion marker.

        The marker is an attribute named after this class on yesterday's
        node in the daily export directory.
        """
        return luigi_utils.YtNodeAttributeTarget(
            path=os.path.join(
                config.YANDEXUID_DAILY_EXPORT_DIRECTORY,
                date_helpers.get_yesterday(self.date),
            ),
            attribute_name=self.__class__.__name__,
            attribute_value=True,
        )

    def get_clickhouse_report_data(self, yandexuids):
        """Fetch today's distinct (UserID, Sex, Age) rows for the given yandexuids.

        :param yandexuids: iterable of yandexuids to look up.
        :return: list of raw response lines (tab-separated, may contain empty
            lines); an empty list when no yandexuids were given.
        """
        yandexuids = list(yandexuids)
        if not yandexuids:
            # Nothing to look up: avoid sending a query with an empty
            # ``in ()`` list and ``LIMIT 0``.
            return []

        clickhouse_query = """
                SELECT DISTINCT UserID, Sex, Age
                FROM visits_all
                WHERE StartDate = today() AND UserID in ({yandexuids})
                LIMIT {count}
               """.format(yandexuids=', '.join(map(str, yandexuids)), count=len(yandexuids))

        result = self.clickhouse_client.make_query(clickhouse_query)
        return result.split('\n')

    def check_in_clickhouse(self, socdem_chunk):
        """Compare exported socdem values with the ones returned by ClickHouse.

        :param socdem_chunk: dict ``yandexuid -> (gender, age_segment)`` where
            either value may be ``None`` when the profile has no such field;
            ``None`` values are not compared.
        :return: tuple ``(diff, no_diff)`` - number of returned rows with and
            without a mismatch.
        """
        diff = 0
        no_diff = 0
        for line in self.get_clickhouse_report_data(yandexuids=socdem_chunk.keys()):
            if not line:
                # The response is split on newlines, so empty chunks are expected.
                continue

            yandexuid, metrica_gender, metrica_age = [int(value) for value in line.split('\t')]
            gender, age_segment = socdem_chunk[yandexuid]
            is_diff = False

            if gender is not None and gender != metrica_gender:
                is_diff = True
                self.logger.error('{}: gender differs. exported: {} actual: {}'.format(yandexuid, gender, metrica_gender))

            if age_segment is not None and age_segment != metrica_age:
                is_diff = True
                self.logger.error('{}: age differs. exported: {} actual: {}'.format(yandexuid, age_segment, metrica_age))

            if is_diff:
                diff += 1
            else:
                no_diff += 1

        return diff, no_diff

    def check_metrica_consistency(self, profiles_for_14days_table, sample_chunk_count, sample_chunk_size):
        """Sample profiles from YT and count how many of them disagree with Metrica.

        :param profiles_for_14days_table: path of the profiles table to sample.
        :param sample_chunk_count: number of random table ranges to read.
        :param sample_chunk_size: number of rows in every range.
        :return: tuple ``(with_difference, without_difference)``; ``(0, 0)``
            when no table ranges could be sampled.
        """
        random_table_ranges_from_yt = utils.get_random_table_ranges_from_yt(
            self.yt,
            profiles_for_14days_table,
            sample_chunk_count=int(sample_chunk_count),
            sample_chunk_size=int(sample_chunk_size),
        )

        if not random_table_ranges_from_yt:
            self.logger.error('Input tables are empty')
            # Always return a pair so that callers can unpack the result.
            return 0, 0

        yandexuids_without_difference_counter = 0
        yandexuids_with_difference_counter = 0

        for yt_table_range in random_table_ranges_from_yt:
            socdem_chunk = {}
            self.logger.info('yt_table_range = {yt_table_range}'.format(yt_table_range=repr(yt_table_range)))
            for row in self.yt.read_table(yt_table_range):
                metrica_gender, metrica_age_segment = None, None

                exact_socdem = row.get('exact_socdem')
                if exact_socdem:
                    gender = exact_socdem.get('gender')
                    if gender:
                        # 'm' is encoded as 1, any other non-empty value as 2.
                        metrica_gender = 1 if gender == 'm' else 2

                    age_segment = exact_socdem.get('age_segment')
                    if age_segment:
                        metrica_age_segment = profiles_age_segment_to_metrica_age_segment[age_segment]

                socdem_chunk[row['yandexuid']] = (metrica_gender, metrica_age_segment)

            diff, no_diff = self.check_in_clickhouse(socdem_chunk)
            yandexuids_without_difference_counter += no_diff
            yandexuids_with_difference_counter += diff

        return yandexuids_with_difference_counter, yandexuids_without_difference_counter

    def _report_to_juggler(self, status, description=None):
        """Send a Juggler event for this monitoring; ``description`` is optional."""
        kwargs = dict(
            status=status,
            service=self._JUGGLER_SERVICE,
            host=config.CRYPTA_PROFILE_JUGGLER_HOST,
            logger=loggers.get_stderr_logger(),
        )
        if description is not None:
            kwargs['description'] = description
        report_event_to_juggler(**kwargs)

    def run(self):
        """Run the check, publish the mismatch percentage and report to Juggler.

        When nothing could be compared (no rows sampled or no rows returned
        by ClickHouse) a WARN event is reported and the completion attribute
        is not set.
        """
        sample_chunk_count = 1
        sample_chunk_size = 10000  # shouldn't be more (limit 50000)
        self.logger.info('Monitoring Metrica consistency')
        bad_counter, good_counter = self.check_metrica_consistency(
            profiles_for_14days_table=self.input().table,
            sample_chunk_count=sample_chunk_count,
            sample_chunk_size=sample_chunk_size,
        )

        total_sent = sample_chunk_count * sample_chunk_size
        total_received = bad_counter + good_counter
        if total_received == 0:
            # No percentage can be computed from zero rows; report the
            # problem instead of failing with a division by zero.
            message = 'No data found in clickhouse for specified {} yandexuids'.format(total_sent)
            self.logger.error(message)
            self._report_to_juggler('WARN', description=message)
            return

        metrica_inconsistent_keywords_percentage = (bad_counter * 100.0) / total_received
        loggers.send_to_graphite(
            name='metrica_inconsistent_keywords_percentage',
            value=metrica_inconsistent_keywords_percentage,
        )

        self.logger.info(
            'Metrica yandexuids: consistent = {good_counter}, inconsistent = {bad_counter}, bad% = {percent}'.format(
                good_counter=good_counter,
                bad_counter=bad_counter,
                percent=metrica_inconsistent_keywords_percentage,
            ),
        )

        self.yt.set_attribute(self.output().path, self.__class__.__name__, True)

        threshold_percent = 10
        if metrica_inconsistent_keywords_percentage > threshold_percent:
            message = 'Percent of inconsistent Metrica entries={bad}% is above the threshold={threshold}%'.format(
                bad=metrica_inconsistent_keywords_percentage,
                threshold=threshold_percent,
            )
            self._report_to_juggler('WARN', description=message)
        else:
            self._report_to_juggler('OK')


if __name__ == '__main__':
    # Script entry point: run the monitoring task for today's date through
    # the configured Luigi scheduler.
    import datetime

    command_line = [
        '--scheduler-url', config.LUIGI_SCHEDULER_URL,
        '--date', datetime.date.today().isoformat(),
    ]
    luigi.run(command_line, main_task_cls=MetricaConsistencyMonitoring)
