# -*- coding: utf-8 -*-

import os

import luigi

from crypta.profile.utils.config import config
from crypta.profile.utils.socdem import socdem_storage_schema
from crypta.profile.utils.luigi_utils import ExternalInput, BaseYtTask, YtDailyRewritableTarget, YtNodeAttributeTarget


def targets_to_tables(targets):
    """Collect the ``table`` attribute of every target, preserving order.

    :param targets: iterable of objects that expose a ``table`` attribute
    :return: list holding one ``table`` value per target
    """
    tables = []
    for item in targets:
        tables.append(item.table)
    return tables


# YQL template that rebuilds the per-dit_id attributes accumulator.
# str.format placeholders:
#   {input_tables} - expression placed inside CONCAT(...), i.e. the tables to merge
#   {output_table} - table overwritten with the result (INSERT ... WITH TRUNCATE)
# Rows are grouped by dit_id. Every attribute column is computed independently
# as MAX_BY(<attribute>, ts), and ts becomes MAX(ts) of the group.
attrs_query = """
INSERT INTO `{output_table}` WITH TRUNCATE
SELECT
    dit_id,
    MAX_BY(gender, ts) AS gender,
    MAX_BY(age, ts) AS age,
    MAX_BY(children, ts) AS children,
    MAX_BY(realty, ts) AS realty,
    MAX_BY(car, ts) AS car,
    MAX(ts) AS ts
FROM CONCAT({input_tables})
GROUP BY dit_id
ORDER BY dit_id
"""

# YQL template shared by the e-mail and phone accumulators.
# str.format placeholders:
#   {id_type}      - name of the identifier column
#                    (this file passes 'email_md5' or 'phone_md5')
#   {input_tables} - expression placed inside CONCAT(...), i.e. the tables to merge
#   {output_table} - table overwritten with the result (INSERT ... WITH TRUNCATE)
# Produces one row per distinct (dit_id, {id_type}) pair together with the
# largest ts seen for that pair.
id_query = """
INSERT INTO `{output_table}` WITH TRUNCATE
SELECT
    dit_id,
    {id_type},
    MAX(ts) AS ts
FROM CONCAT({input_tables})
GROUP BY dit_id, {id_type}
ORDER BY dit_id
"""

# YQL template that builds one combined attributes table keyed by (id, id_type).
# str.format placeholders:
#   {attrs_accumulator}, {phones_accumulator}, {emails_accumulator} - input tables
#   {output_table} - table overwritten with the result (INSERT ... WITH TRUNCATE)
# Steps:
#   1. $dit_id_info: attribute rows whose dit_id is not NULL.
#   2. $phone_matching / $email_matching: for every hash, the de-duplicated list
#      of dit_ids is collected; only hashes whose list has exactly one element
#      are kept, so hashes linked to several dit_ids are dropped.
#   3. $phone_md5_info / $email_md5_info: the dit_id attributes re-keyed by the
#      matched hash through an INNER JOIN on dit_id.
#   4. The output is the UNION ALL of the dit_id rows (id_type 'dit_id') with
#      the e-mail rows (id_type 'email_md5') and phone rows (id_type 'phone_md5').
all_info_query = """
$dit_id_info = (
    SELECT *
    FROM `{attrs_accumulator}`
    WHERE dit_id IS NOT NULL
);

$phone_matching = (
    SELECT phone_md5, dit_id[0] AS dit_id
    FROM (
        SELECT phone_md5, ListUniq(AGGREGATE_LIST(dit_id)) AS dit_id
        FROM `{phones_accumulator}`
        GROUP BY phone_md5
    )
    WHERE ListLength(dit_id) == 1
);

$email_matching = (
    SELECT email_md5, dit_id[0] AS dit_id
    FROM (
        SELECT email_md5, ListUniq(AGGREGATE_LIST(dit_id)) AS dit_id
        FROM `{emails_accumulator}`
        GROUP BY email_md5
    )
    WHERE ListLength(dit_id) == 1
);

$phone_md5_info = (
    SELECT
        matching.phone_md5 AS id,
        'phone_md5' AS id_type,
        info.realty AS realty,
        info.gender AS gender,
        info.children AS children,
        info.car AS car,
        info.ts AS ts,
        info.age AS age
    FROM $dit_id_info AS info
    INNER JOIN $phone_matching AS matching
    USING (dit_id)
);

$email_md5_info = (
    SELECT
        matching.email_md5 AS id,
        'email_md5' AS id_type,
        info.realty AS realty,
        info.gender AS gender,
        info.children AS children,
        info.car AS car,
        info.ts AS ts,
        info.age AS age
    FROM $dit_id_info AS info
    INNER JOIN $email_matching AS matching
    USING (dit_id)
);

INSERT INTO `{output_table}` WITH TRUNCATE
SELECT
    dit_id AS id,
    'dit_id' AS id_type,
    realty,
    gender,
    children,
    car,
    ts,
    age
FROM $dit_id_info
UNION ALL
SELECT *
FROM $email_md5_info
UNION ALL
SELECT *
FROM $phone_md5_info
"""

# YQL template that splits the combined info table into three socdem tables.
# str.format placeholders:
#   {all_info_table} - input table with id, id_type, age, gender and ts columns
#   {dit_id_output_table}, {phone_md5_output_table}, {email_md5_output_table}
#       - output tables, one per id_type
# $all_socdem keeps rows with a non-NULL id and at least one of age / gender
# set, exposing ts under the name update_time. Each INSERT then filters by
# id_type and adds the constant source 'dit'.
# NOTE: unlike the other templates in this file, these INSERTs have no
# WITH TRUNCATE. DitSocdemMatching.run calls create_empty_table on every output
# table before running this query.
socdem_query = """
$all_socdem = (
    SELECT id, id_type, age, gender, ts AS update_time
    FROM `{all_info_table}`
    WHERE id IS NOT NULL AND (age IS NOT NULL OR gender IS NOT NULL)
);

INSERT INTO `{dit_id_output_table}`
SELECT
    id,
    id_type,
    gender,
    age,
    'dit' AS source,
    update_time
FROM $all_socdem
WHERE id_type == 'dit_id';


INSERT INTO `{phone_md5_output_table}`
SELECT
    id,
    id_type,
    gender,
    age,
    'dit' AS source,
    update_time
FROM $all_socdem
WHERE id_type == 'phone_md5';


INSERT INTO `{email_md5_output_table}`
SELECT
    id,
    id_type,
    gender,
    age,
    'dit' AS source,
    update_time
FROM $all_socdem
WHERE id_type == 'email_md5';
"""


class BuildDitDataAccumulator(BaseYtTask):
    """Merge not-yet-processed DIT tables into the accumulator tables.

    New tables are discovered by listing the ``attrs`` folder under
    ``config.DIT_DATA_FOLDER``; tables with the same names are required from
    the ``emails`` and ``phones`` folders as well. ``run()`` rebuilds the three
    accumulators and the combined info table and then stores the newest table
    name in the ``last_processed_timestamp`` attribute of
    ``config.DIT_DATA_STORAGE_FOLDER``. ``output()`` is a
    ``YtNodeAttributeTarget`` for that same attribute/value pair.
    """

    date = luigi.Parameter()
    juggler_host = config.CRYPTA_ML_JUGGLER_HOST
    task_group = 'import_socdem_data'

    def __init__(self, date):
        """Read the processing state from YT and pick the tables to process.

        :param date: value of the ``date`` luigi parameter
        :raises IndexError: if the ``attrs`` folder contains no tables
        """
        super(BuildDitDataAccumulator, self).__init__(date)

        # None means the attribute is absent, i.e. nothing was processed yet.
        self.last_processed_timestamp = self.yt.get_attribute(
            config.DIT_DATA_STORAGE_FOLDER,
            'last_processed_timestamp',
            None,
        )

        self.attrs_folder = os.path.join(config.DIT_DATA_FOLDER, 'attrs')
        self.emails_folder = os.path.join(config.DIT_DATA_FOLDER, 'emails')
        self.phones_folder = os.path.join(config.DIT_DATA_FOLDER, 'phones')

        # Columns shared by all three accumulator tables.
        self.schema_template = {
            'dit_id': 'string',
            'ts': 'uint64',
        }

        self.attrs_columns = {
            'children': 'boolean',
            'car': 'boolean',
            'realty': 'boolean',
            'gender': 'string',
            'age': 'uint64',
        }

        self.attrs_accumulator_schema = self.schema_template.copy()
        self.attrs_accumulator_schema.update(self.attrs_columns)

        self.emails_accumulator_schema = self.schema_template.copy()
        self.emails_accumulator_schema.update({'email_md5': 'string'})

        self.phones_accumulator_schema = self.schema_template.copy()
        self.phones_accumulator_schema.update({'phone_md5': 'string'})

        available_timestamps = self.yt.list(self.attrs_folder)

        if self.last_processed_timestamp is None:
            # Comparing against None only worked through Python 2's arbitrary
            # ordering (and raises TypeError on Python 3); state the intent
            # explicitly: with no stored state every table is new.
            self.timestamps_to_process = list(available_timestamps)
        else:
            # Table names are compared as returned by yt.list.
            # NOTE(review): presumably fixed-width timestamp strings, for which
            # this ordering is chronological - confirm.
            self.timestamps_to_process = [
                timestamp
                for timestamp in available_timestamps
                if timestamp > self.last_processed_timestamp
            ]

        # Raises IndexError when the attrs folder is empty.
        self.last_available_timestamp = sorted(available_timestamps)[-1]

    def requires(self):
        """Return external inputs for every new table, grouped by data kind.

        :return: dict with keys ``'attrs'``, ``'emails'`` and ``'phones'``,
            each mapping to a list of ``ExternalInput`` tasks
        """
        return {
            'attrs': [
                ExternalInput(os.path.join(self.attrs_folder, ts))
                for ts in self.timestamps_to_process
            ],
            'emails': [
                ExternalInput(os.path.join(self.emails_folder, ts))
                for ts in self.timestamps_to_process
            ],
            'phones': [
                ExternalInput(os.path.join(self.phones_folder, ts))
                for ts in self.timestamps_to_process
            ],
        }

    def output(self):
        """Return the target tied to the ``last_processed_timestamp`` attribute.

        :return: ``YtNodeAttributeTarget`` for the newest available table name
        """
        return YtNodeAttributeTarget(
            path=config.DIT_DATA_STORAGE_FOLDER,
            attribute_name='last_processed_timestamp',
            attribute_value=self.last_available_timestamp,
        )

    def _accumulate(self, input_name, accumulator_table, schema, query_template,
                    transaction, **query_params):
        """Merge one group of input tables into its accumulator table.

        :param input_name: key of ``self.input()`` holding the new tables
        :param accumulator_table: path of the accumulator; it is both read
            (as one of the CONCAT inputs) and overwritten by the query
        :param schema: schema used to create the accumulator when it is missing
        :param query_template: YQL template with ``{input_tables}`` and
            ``{output_table}`` placeholders
        :param transaction: transaction passed to ``self.yql.query``
        :param query_params: extra placeholders for ``query_template``
        """
        # The accumulator takes part in CONCAT, so it has to exist beforehand.
        if not self.yt.exists(accumulator_table):
            self.yt.create_empty_table(
                path=accumulator_table,
                schema=schema,
            )

        input_tables = targets_to_tables(self.input()[input_name]) + [accumulator_table]
        concat_str = ', '.join('`{}`'.format(table) for table in input_tables)

        self.yql.query(
            query_template.format(
                input_tables=concat_str,
                output_table=accumulator_table,
                **query_params
            ),
            transaction=transaction,
        )

    def _accumulate_attrs(self, transaction):
        """Merge new ``attrs`` tables into ``config.DIT_ATTRS_ACCUMULATOR``."""
        self._accumulate(
            'attrs',
            config.DIT_ATTRS_ACCUMULATOR,
            self.attrs_accumulator_schema,
            attrs_query,
            transaction,
        )

    def _accumulate_emails(self, transaction):
        """Merge new ``emails`` tables into ``config.DIT_EMAILS_ACCUMULATOR``."""
        self._accumulate(
            'emails',
            config.DIT_EMAILS_ACCUMULATOR,
            self.emails_accumulator_schema,
            id_query,
            transaction,
            id_type='email_md5',
        )

    def _accumulate_phones(self, transaction):
        """Merge new ``phones`` tables into ``config.DIT_PHONES_ACCUMULATOR``."""
        self._accumulate(
            'phones',
            config.DIT_PHONES_ACCUMULATOR,
            self.phones_accumulator_schema,
            id_query,
            transaction,
            id_type='phone_md5',
        )

    def _combine_info_by_id_and_id_type(self, transaction):
        """Rebuild ``config.DIT_INFO_TABLE`` from the three accumulators."""
        self.yql.query(
            all_info_query.format(
                attrs_accumulator=config.DIT_ATTRS_ACCUMULATOR,
                emails_accumulator=config.DIT_EMAILS_ACCUMULATOR,
                phones_accumulator=config.DIT_PHONES_ACCUMULATOR,
                output_table=config.DIT_INFO_TABLE,
            ),
            transaction=transaction,
        )

    def run(self):
        """Update all accumulators and record the newest processed table name."""
        self.yt.config['spec_defaults']['pool'] = config.SEGMENTS_POOL

        # The attribute is set inside the same transaction block as the
        # queries that rebuild the tables.
        with self.yt.Transaction() as transaction:
            self._accumulate_attrs(transaction)
            self._accumulate_emails(transaction)
            self._accumulate_phones(transaction)
            self._combine_info_by_id_and_id_type(transaction)

            self.yt.set_attribute(
                config.DIT_DATA_STORAGE_FOLDER,
                'last_processed_timestamp',
                self.last_available_timestamp,
            )


class DitSocdemMatching(BaseYtTask):
    """Export age/gender rows from ``config.DIT_INFO_TABLE`` to socdem storage.

    Depends on ``BuildDitDataAccumulator`` for the same ``date`` and fills
    three ``YtDailyRewritableTarget`` outputs under
    ``config.SOCDEM_STORAGE_YT_DIR``: one per id type (``phone_md5``,
    ``email_md5``, ``dit_id``), each in a ``dit`` sub-path.
    """

    date = luigi.Parameter()
    juggler_host = config.CRYPTA_ML_JUGGLER_HOST
    task_group = 'import_socdem_data'

    def requires(self):
        """Return the accumulator-building task for the same date."""
        return BuildDitDataAccumulator(date=self.date)

    def output(self):
        """Return the three output targets keyed by a descriptive name.

        :return: dict with keys ``'DitDataPhoneOutput'``,
            ``'DitDataEmailOutput'`` and ``'DitDataDitIdOutput'``
        """
        return {
            'DitDataPhoneOutput': YtDailyRewritableTarget(
                os.path.join(
                    config.SOCDEM_STORAGE_YT_DIR,
                    'phone_md5',
                    'dit',
                ),
                self.date,
            ),
            'DitDataEmailOutput': YtDailyRewritableTarget(
                os.path.join(
                    config.SOCDEM_STORAGE_YT_DIR,
                    'email_md5',
                    'dit',
                ),
                self.date,
            ),
            'DitDataDitIdOutput': YtDailyRewritableTarget(
                os.path.join(
                    config.SOCDEM_STORAGE_YT_DIR,
                    'dit_id',
                    'dit',
                ),
                self.date,
            ),
        }

    def run(self):
        """Create the output tables, fill them and stamp ``generate_date``."""
        self.yt.config['spec_defaults']['pool'] = config.SEGMENTS_POOL

        with self.yt.Transaction() as transaction:
            # Build the targets once instead of on every lookup.
            outputs = self.output()

            # The query appends (plain INSERT INTO), so each output table is
            # created empty before it runs.
            for output in outputs.values():
                self.yt.create_empty_table(
                    path=output.table,
                    schema=socdem_storage_schema,
                )

            self.yql.query(
                query_string=socdem_query.format(
                    all_info_table=config.DIT_INFO_TABLE,
                    dit_id_output_table=outputs['DitDataDitIdOutput'].table,
                    email_md5_output_table=outputs['DitDataEmailOutput'].table,
                    phone_md5_output_table=outputs['DitDataPhoneOutput'].table,
                ),
                transaction=transaction,
            )

            # Explicit tuple keeps the attribute updates in a fixed order.
            for output_name in ('DitDataDitIdOutput', 'DitDataPhoneOutput', 'DitDataEmailOutput'):
                self.yt.set_attribute(
                    outputs[output_name].table,
                    'generate_date',
                    self.date,
                )
