import hashlib
from functools import partial

import luigi
import yt.wrapper as yt

import soup_validation
from data_imports.import_dumps import graph_passport_dump
from data_imports.import_dumps import graph_people_search
from data_imports.import_dumps.social import graph_social_auth
from data_imports.import_dumps import graph_tickets
from data_imports.import_dumps import graph_yamoney
from data_imports.import_dumps.partners import import_xprod_partners, import_other_dumps
from data_imports.import_dumps.sherlock import import_vkid_mail, graph_insta
from data_imports.import_logs import app_metrica_day
from data_imports.import_logs import device_yuids_mobreport
from data_imports.import_logs import device_yuids_oauth
from data_imports.import_logs import device_yuids_redir
from data_imports.import_logs import device_yuids_sdk
from data_imports.import_logs import device_yuids_tracking
from data_imports.import_logs import graph_access
from data_imports.import_logs import graph_barnavig
from data_imports.import_logs import graph_dmp_ditmsk
from data_imports.import_logs import graph_eal
from data_imports.import_logs import graph_import_fp
from data_imports.import_logs import graph_passport
from data_imports.import_logs import graph_passport_sensitive
from data_imports.import_logs import graph_postclicks
from data_imports.import_logs import graph_rassilyator
from data_imports.import_logs import graph_sbapi_lookup
from data_imports.import_logs import graph_sovetnik
from data_imports.import_logs.bs import xuniq
from data_imports.import_logs.bs import avito
from data_imports.import_logs.idserv import import_idserv_access
from data_imports.import_logs.watch_log import graph_watch_log
from data_imports.import_logs.webvisor import graph_webvisor
from household import hh_composition
from lib import graphite_sender
from lib.luigi import base_luigi_task
from lib.luigi import yt_luigi
from matching.device_matching import device_yuid_mix_perfect_fuzzy
from matching.device_matching.app_metrica import account_manager
from matching.device_matching.fuzzy_heuristic import indevice_fuzzy2
from rtcconf import config
from utils import mr_utils as mr
from utils import utils
from v2 import ids
from v2.soup import soup_config
from v2.soup import soup_edge_type
from v2.soup import soup_utils
from v2.soup.soup_tables import SoupDailyTable, SoupDumpTable, SoupStorageTable, \
    edge_key_columns, edge_type_key_columns


class AddDayToSoup(base_luigi_task.BaseTask):
    """Merge one day of imported DAILY-supply edges into the soup storage tables.

    Requires all daily log imports for ``date``. The outputs are the storage
    table targets that correspond to every DAILY edge type.
    """

    date = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(AddDayToSoup, self).__init__(*args, **kwargs)
        daily_edge_types = (
            edge_type for edge_type in soup_config.ALL_EDGES
            if edge_type.supply_type == soup_edge_type.SupplyType.DAILY
        )
        # one daily input table per DAILY edge type
        self.daily_soup_tables = [SoupDailyTable(edge_type, self.date)
                                  for edge_type in daily_edge_types]

    def requires(self):
        """Return every daily import task that has to finish before the merge."""
        # almost all imports take the same pair of date arguments
        day = dict(date=self.date, run_date=self.date)
        return [
            graph_import_fp.ImportFPDayTask(**day),
            graph_webvisor.ImportWebvisorTask(**day),
            graph_watch_log.ImportWatchLogDayTask(**day),
            app_metrica_day.ImportAppMetrikaDayTask(**day),
            device_yuids_sdk.ImportSdkLogsDayTask(**day),
            graph_access.ImportAccessLogsDayTask(**day),
            graph_barnavig.ImportBarNavigDayTask(**day),
            device_yuids_oauth.ImportOauthLogsDayTask(dict_date=self.date, **day),
            device_yuids_redir.ImportRedirLogsDayTask(**day),
            graph_passport.ImportPassportPhoneDayTask(**day),
            graph_sbapi_lookup.ImportSbApiMitbLogDayTask(**day),
            graph_eal.ImportEalDayTask(**day),
            graph_sbapi_lookup.ImportSbApiAccessLogDayTask(**day),
            graph_passport_sensitive.ImportPassportPhoneBindingsDayTask(**day),
            device_yuids_tracking.ImportMobileTrackingLogsDayTask(**day),
            device_yuids_mobreport.ImportMobReportLogsDayTask(**day),
            graph_postclicks.ImportPostclicksDailyTask(**day),
            graph_rassilyator.ImportSenderDayTask(**day),
            graph_sovetnik.ImportSovetnikDayTask(**day),
            import_idserv_access.ImportIdservAccessLog(**day),
            xuniq.ImportBsXuniqsLog(**day),
            graph_dmp_ditmsk.ImportDitMskTask(**day),
            avito.ImportAvito(log_source=soup_config.LOG_SOURCE_BS_RTBLOG, **day),
            avito.ImportAvito(log_source=soup_config.LOG_SOURCE_BS_HITLOG, **day),
        ]

    def run(self):
        """Reduce daily tables together with storage tables and report validation errors."""
        # DANGER: don't try in prod, you may lose soup data.
        # A cut-off date is passed to the reducer in the testing environment only.
        throw_before_date = None
        if config.CRYPTA_ENV == 'testing':
            throw_before_date = utils.get_date_before(self.date, 7)

        with yt.Transaction() as tr:
            day_paths = []
            storage_tables = []
            storage_paths = []
            out_indexes = {}

            for position, day_table in enumerate(self.daily_soup_tables):
                day_paths.append(day_table.table_path())

                storage_table = day_table.corresponding_storage_table()
                storage_tables.append(storage_table)
                storage_paths.append(storage_table.create(tr, recreate_if_exists=False))

                # edge type -> position of its storage table among the outputs
                out_indexes[day_table.edge_type] = position

            validation_dir = soup_config.SOUP_DAY_DIR + self.date + '/validation'
            mr.mkdir(validation_dir)
            validation_errors = validation_dir + '/incremental_errors'

            increment = partial(soup_utils.increment_day_activity,
                                out_table_indexes=out_indexes,
                                throw_before_date=throw_before_date)

            # assume sorted
            # The storage tables are both read and rewritten; the extra last
            # output collects validation errors.
            yt.run_map_reduce(
                None,
                increment,
                day_paths + storage_paths,
                storage_paths + [validation_errors],
                reduce_by=edge_key_columns,
                sort_by=edge_key_columns
            )

            SoupStorageTable.finalize_all(storage_tables, tr)

            report_validation_errors(validation_errors, 'incremental', self.date)

    def output(self):
        """Return the storage table targets of all DAILY edge types."""
        targets = [day_table.corresponding_storage_table().as_target()
                   for day_table in self.daily_soup_tables]
        return self.log_yt_targets_check(targets)


def extract_vertices(rec, filter_types):
    """Yield ``{'id', 'idType'}`` for each edge endpoint whose id type is in ``filter_types``.

    The first endpoint (``id1``) is checked before the second (``id2``), so at
    most two vertices are produced per edge record.
    """
    for id_column, type_column in (('id1', 'id1Type'), ('id2', 'id2Type')):
        id_type = rec[type_column]
        if id_type in filter_types:
            yield {'id': rec[id_column], 'idType': id_type}


def report_validation_errors(errors_table, soup_type, date):
    """Count validation errors per edge type and send the counts to graphite.

    ``errors_table`` is sorted in place by ``edge_type_key_columns``, the
    per-key counts are written to ``errors_table + '_per_source_count'`` and
    then reported under ``soup.validation_errors.per_source.<soup_type>``.
    """
    counts_table = errors_table + '_per_source_count'

    yt.run_sort(errors_table, sort_by=edge_type_key_columns)

    count_errors = partial(mr.count_by_columns, columns=edge_type_key_columns)
    yt.run_reduce(
        count_errors,
        errors_table,
        counts_table,
        reduce_by=edge_type_key_columns
    )

    # edge type name -> stringified number of errors
    metrics = {}
    for row in yt.read_table(counts_table):
        edge_type = soup_utils.get_rec_edge_type(row)
        metrics[edge_type.name()] = str(row['count'])

    graphite_sender.to_graphite_sender_batch(
        'soup.validation_errors.per_source.' + soup_type,
        metrics.iteritems(),
        date
    )


def uniq_vertices_by_type(vertex_key, _, out_indexes):
    """Emit one record per reduce key, routed to the output table of its id type.

    The key itself is the output record; grouped records are ignored. Id types
    missing from ``out_indexes`` go to the index right after the known ones.
    """
    vertex = dict(vertex_key)
    id_type = vertex['idType']
    table_index = out_indexes.get(id_type, len(out_indexes))
    vertex['@table_index'] = table_index
    yield vertex


def id_to_hash_edge(rec, out_edge_type):
    """Yield an edge from the record's id to its hashed form, if one is produced.

    The hash is obtained by normalizing the original id as
    ``out_edge_type.id2_type``; nothing is yielded when the result is falsy.
    """
    source_id = rec.get('id')

    # for some reason normalization converts to hash !?
    digest = soup_validation.normalize(out_edge_type.id2_type, source_id)
    if not digest:
        return

    yield SoupStorageTable.make_rec(source_id, digest, out_edge_type, [])


def avito_denormalize_email(email):
    """Return the email spellings to hash when matching against avito hashes.

    Only addresses of the form ``<user>@yandex.ru`` are expanded: dashes in the
    user part become dots and the result is repeated for every yandex domain.
    Any other input (including strings without exactly one ``@``) is returned
    unchanged as a single-element list.
    """
    pieces = email.split('@')
    if len(pieces) != 2:
        return [email]

    username, domain = pieces
    if domain != 'yandex.ru':
        return [email]

    # avito does the opposite of us when normalizing yandex emails before hashing
    avito_username = username.replace('-', '.')

    yandex_domains = (
        'yandex.ru',
        'ya.ru',
        'yandex.by',
        'yandex.ua',
        'yandex.kz',
        'yandex.com',
        'yandex.com.tr',
    )
    return [avito_username + '@' + yandex_domain for yandex_domain in yandex_domains]


def gen_avito_hashes_map(rec):
    """Map step of the avito hash join.

    Records from input table 0 are turned into bare ``id2`` markers. Records
    from any other input table are treated as emails: a salted md5 edge is
    emitted for every spelling returned by ``avito_denormalize_email``. Each
    output carries its source table index in ``_table_index``.
    """
    table_index = rec['@table_index']

    if table_index == 0:
        yield dict(id2=rec['id'], _table_index=table_index)
        return

    avito_salt = 'fy5drs34dgh13ff'
    source_email = rec.get('id')

    for candidate in avito_denormalize_email(source_email):
        digest = hashlib.md5(candidate + avito_salt).hexdigest()
        edge = SoupStorageTable.make_rec(source_email, digest, soup_config.email_avito_hash, [])
        edge['_table_index'] = table_index
        yield edge


def gen_avito_hashes_reduce(email, recs):
    """Reduce step of the avito hash join.

    Passes through records with a non-zero ``_table_index`` (with that helper
    column removed), but only those that come after a table-0 marker within
    the same group. The first argument is the reduce key and is not used.
    """
    marker_seen = False
    for row in recs:
        if row['_table_index'] == 0:
            marker_seen = True
            continue
        if not marker_seen:
            continue
        row.pop('_table_index')
        yield row


def map_login_to_email(rec, out_edge_type):
    """Yield a login -> email edge when ``utils.login_to_email`` returns an email."""
    login = rec['id']
    email = utils.login_to_email(login)
    if not email:
        return
    yield SoupStorageTable.make_rec(login, email, out_edge_type, [])


def calculate_uniq_vertices_from_edges(in_soup_tables, vertices_id_types, out_dir, tr):
    """
    Collects the unique ids of vertices_id_types found in in_soup_tables.

    One table per id type is (re)created at out_dir + <id type> and ends up
    sorted by (idType, id); ids of any other type are routed to
    out_dir + 'unknown_id_type'. Returns the list of per-type table paths.
    """

    def touches_requested_type(soup_table):
        # an edge table matters if either endpoint has a requested id type
        edge_type = soup_table.edge_type
        return (edge_type.id1_type in vertices_id_types or
                edge_type.id2_type in vertices_id_types)

    relevant_soup_tables = [t for t in in_soup_tables if touches_requested_type(t)]

    # output tables - one for each vertex type, addressed by table index
    out_tables = []
    out_indexes = {}
    for position, id_type in enumerate(vertices_id_types):
        out_tables.append(out_dir + id_type)
        out_indexes[id_type] = position

    vertex_schema = {'id': 'string', 'idType': 'string'}
    for table in out_tables:
        mr.create_table_with_schema(table, vertex_schema, tr)

    # collect all ids from edges
    mapper = partial(extract_vertices, filter_types=vertices_id_types)
    reducer = partial(uniq_vertices_by_type, out_indexes=out_indexes)
    yt.run_map_reduce(
        mapper,
        reducer,
        [t.table_path() for t in relevant_soup_tables],
        out_tables + [out_dir + 'unknown_id_type'],
        reduce_by=['id', 'idType']
    )
    mr.sort_all(out_tables, sort_by=['idType', 'id'])

    return out_tables


class AddDumpsToSoup(base_luigi_task.BaseTask):
    """Validate DUMP-supply edge tables and write them into the soup storage tables.

    Requires all dump imports for ``date``. The outputs are the storage table
    targets that correspond to every DUMP edge type.
    """

    date = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(AddDumpsToSoup, self).__init__(*args, **kwargs)
        dump_edge_types = (
            edge_type for edge_type in soup_config.ALL_EDGES
            if edge_type.supply_type == soup_edge_type.SupplyType.DUMP
        )
        # one dump input table per DUMP edge type
        self.dump_tables = [SoupDumpTable(edge_type, self.date)
                            for edge_type in dump_edge_types]

    def requires(self):
        """Return every dump import task that has to finish before the copy."""
        day = self.date
        return [
            account_manager.AccountManagerUpdateDictTask(date=day),
            graph_passport_dump.ImportPassportDump(date=day),
            graph_passport_dump.ImportPassportStatboxHeavyDict(date=day),
            graph_people_search.ImportPeopleSearch(date=day),
            graph_yamoney.ImportYandexMoneyDump(date=day),
            graph_tickets.ImportYandexTicketsDump(date=day),
            graph_social_auth.ImportSocialAuthDump(date=day),
            import_vkid_mail.ImportVkidPuidMailDump(date=day),
            import_xprod_partners.ImportPartnersDump(date=day),
            import_other_dumps.ImportEmailPhoneDumpsToSoup(date=day),
            graph_insta.ImportInstagramDayTask(date=day),
            device_yuid_mix_perfect_fuzzy.DevidYuidMixPerfectFuzzy(date=day),
            hh_composition.HHCompositionTask(day),
            indevice_fuzzy2.Fuzzy2MonthTask(date=day),
        ]

    def run(self):
        """Map dump tables into freshly created storage tables and report validation errors."""
        validation_dir = soup_config.SOUP_DIR + 'validation'
        mr.mkdir(validation_dir)
        validation_errors = validation_dir + '/dump_errors'

        with yt.Transaction() as tr:
            soup_storage_tables = [dump_table.corresponding_storage_table()
                                   for dump_table in self.dump_tables]

            source_paths = [dump_table.table_path() for dump_table in self.dump_tables]
            storage_paths = [storage_table.create(tr) for storage_table in soup_storage_tables]

            # the errors table is appended after the storage tables, hence its
            # table index equals the number of storage tables
            validator = partial(soup_utils.validate_dump_rec,
                                error_table_index=len(storage_paths))
            yt.run_map(
                validator,
                source_paths,
                storage_paths + [validation_errors],
                spec=mr.DATA_SIZE_PER_JOB_20MB_SPEC
            )

            SoupStorageTable.finalize_all(soup_storage_tables, tr)

        report_validation_errors(validation_errors, 'dump', self.date)

    def output(self):
        """Return the storage table targets of all DUMP edge types."""
        targets = [dump_table.corresponding_storage_table().as_target()
                   for dump_table in self.dump_tables]
        return self.log_yt_targets_check(targets)


class SoupPreprocessing(base_luigi_task.BaseTask):
    """Generate artificial (PREPROC) soup edges and refresh the unique-id tables.

    Runs after AddDayToSoup and AddDumpsToSoup for the same date. Inside a
    single YT transaction it:

    1. extracts unique ids of every non-hash id type from the soup tables;
    2. derives login -> email edges and recalculates unique emails;
    3. derives original value -> hash edges (``soup_config.md5_hash_preproc_edges``)
       and email -> avito hash edges, then recalculates unique hash ids.

    Unique ids are stored as one table per id type under
    ``soup_config.SOUP_DIR + 'ids/'``.
    """
    date = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        # Output paths are prepared before the base constructor is invoked.
        # NOTE(review): presumably the base __init__ may already need them
        # (e.g. through output()) - confirm before reordering.
        self.soup_ids_out_dir = soup_config.SOUP_DIR + 'ids/'
        # one unique-ids table path per known soup id type
        self.out_uniq_ids_table = [self.soup_ids_out_dir + id_t
                                   for id_t in soup_edge_type.get_all_soup_id_types()]

        super(SoupPreprocessing, self).__init__(*args, **kwargs)

    def requires(self):
        """Both the daily and the dump soup parts must be ready for this date."""
        return [
            AddDayToSoup(self.date),
            AddDumpsToSoup(self.date)
        ]

    def run(self):
        """Build all PREPROC edge tables and the per-id-type unique-id tables."""
        soup_ids_out_dir = self.soup_ids_out_dir
        mr.mkdir(soup_ids_out_dir)

        with yt.Transaction() as tr:  # affects shared directory, thus input should be consistent

            # id types that are not a value of ID_HASH_MAPPING, i.e. not produced by hashing
            all_not_hashed_id_types = [id_t for id_t in soup_edge_type.get_all_soup_id_types()
                                       if id_t not in soup_config.ID_HASH_MAPPING.values()]

            # storage tables of every edge type that is not generated by this task
            soup_tables_before_preproc = [SoupStorageTable(et, self.date) for et in soup_config.ALL_EDGES
                                          if et.supply_type != soup_edge_type.SupplyType.PREPROC]

            # 1. fetch all ids from edges to make some artificial edges from them
            calculate_uniq_vertices_from_edges(
                soup_tables_before_preproc,
                all_not_hashed_id_types,
                soup_ids_out_dir,
                tr
            )

            # 2. generate yandex emails from all suitable yandex logins
            #    (input is the unique logins table produced at step 1)
            login_to_email_out_t = SoupStorageTable(soup_config.login_to_email_et, self.date)
            yt.run_map(
                partial(map_login_to_email, out_edge_type=soup_config.login_to_email_et),
                soup_ids_out_dir + config.ID_TYPE_LOGIN,
                login_to_email_out_t.create(tr)
            )

            # recalculate all affected ids after adding new edge type
            calculate_uniq_vertices_from_edges(
                soup_tables_before_preproc + [login_to_email_out_t],
                [config.ID_TYPE_EMAIL],  # only new emails were generated at previous step
                soup_ids_out_dir,
                tr
            )

            # 3. add artificial edges from original value to hash for some id types
            tables_with_hashes = []
            # operations started with sync=False; all of them are awaited by utils.wait_all below
            ops = []
            for out_edge_type in soup_config.md5_hash_preproc_edges:
                hashed_soup_t = SoupStorageTable(out_edge_type, self.date)
                tables_with_hashes.append(hashed_soup_t)

                # hashing is heavy op, trying to parallelize
                job_count = mr.calculate_optimized_mr_partition_count(
                    soup_ids_out_dir + out_edge_type.id1_type,
                    rows_per_job=500000
                )

                # input is the unique-ids table of the edge's source id type
                ops.append(
                    yt.run_map(
                        partial(id_to_hash_edge, out_edge_type=out_edge_type),
                        soup_ids_out_dir + out_edge_type.id1_type,
                        hashed_soup_t.create(tr),
                        job_count=job_count,
                        sync=False
                    )
                )

            # 3.5. Also add avito hashes for emails (salted md5).
            #      Only generate hashes that are already present among all the avito hashes.
            #      That way we'll be able to connect avito stuff with ours and won't have to store
            #      useless hashes that don't match anything.
            src_emails_tbl = soup_ids_out_dir + ids.EMAIL
            existing_avito_hashes_tbl = soup_ids_out_dir + ids.AVITO_HASH
            avitified_emails_tbl = SoupStorageTable(soup_config.email_avito_hash, self.date)
            # Input order matters: gen_avito_hashes_map treats table index 0 as the
            # existing hashes. Sorting by (id2, _table_index) within each id2 group puts
            # the index-0 marker first, which gen_avito_hashes_reduce relies on.
            ops.append(
                yt.run_map_reduce(
                    gen_avito_hashes_map,
                    gen_avito_hashes_reduce,
                    [existing_avito_hashes_tbl, src_emails_tbl],
                    avitified_emails_tbl.create(tr),
                    sort_by=['id2', '_table_index'],
                    reduce_by=['id2'],
                    sync=False
                )
            )

            # block until every asynchronously started operation has been waited for
            utils.wait_all(ops)

            # despite the name, this list also includes the non-PREPROC source tables
            soup_preproc_generated_tables = (
                soup_tables_before_preproc +
                [login_to_email_out_t] +
                tables_with_hashes +
                [avitified_emails_tbl]
            )

            # recalculate all affected ids after adding new edge type
            # NOTE: `values() + [...]` relies on dict.values() returning a list (Python 2)
            calculate_uniq_vertices_from_edges(
                soup_preproc_generated_tables,
                soup_config.ID_HASH_MAPPING.values() + [ids.AVITO_HASH],  # only hashes were generated at previous step
                soup_ids_out_dir,
                tr
            )

            SoupStorageTable.finalize_all(soup_preproc_generated_tables, tr)

            # stamp every unique-ids table with the processed date; output() builds
            # YtDateTarget objects for the same tables and date
            for t in self.out_uniq_ids_table:
                mr.set_generate_date(t, self.date)

    def output(self):
        """Return targets for all unique-id tables plus all PREPROC storage tables."""
        uniq_ids_targets = [yt_luigi.YtDateTarget(t, self.date)
                            for t in self.out_uniq_ids_table]

        generated_soup_targets = [
            SoupStorageTable(et, self.date).as_target()
            for et in soup_config.ALL_EDGES
            if et.supply_type == soup_edge_type.SupplyType.PREPROC
        ]

        return self.log_yt_targets_check(
            uniq_ids_targets + generated_soup_targets
        )


class SoupIsReady(luigi.WrapperTask):
    """Wrapper task that is complete once every soup building stage for ``date`` is done."""

    date = luigi.Parameter()

    def requires(self):
        """Return the daily, dump and preprocessing stages for this date, in that order."""
        stages = (AddDayToSoup, AddDumpsToSoup, SoupPreprocessing)
        return [stage(self.date) for stage in stages]


if __name__ == '__main__':
    # Ad-hoc manual run: point the YT client at the configured cluster and
    # build SoupPreprocessing for a hard-coded date.
    yt.config.set_proxy(config.MR_SERVER)
    # Table indexes must be processed: the mappers/reducers in this file read
    # and write the '@table_index' field of records.
    yt.config["tabular_data_format"] = yt.YsonFormat(process_table_index=True)

    luigi.build([SoupPreprocessing('2017-11-21')], workers=1, scheduler_port=8083)
    #
    # Disabled one-off cleanup snippet (Python 2 syntax), kept for reference:
    # it removes soup tables whose modification date is 2017-07-01 or earlier.
    # tables = yt.list('//home/crypta/production/state/graph/v2/soup', absolute=True)
    # for t in tables:
    #     mod_time = yt.get_attribute(t, 'modification_time', None)[:10]
    #     node_type = yt.get_attribute(t, 'type')
    #     if mod_time <= '2017-07-01' and node_type == 'table':
    #         print mod_time
    #         print t
    #         yt.remove(t)
