# coding=utf-8
import os
from functools import partial

import luigi
import yt.wrapper as yt

from data_imports.import_dumps import graph_yamoney
from data_imports.import_logs import graph_access
from data_imports.import_logs import graph_barnavig
from data_imports.import_logs import graph_dmp_ditmsk
from data_imports.import_logs import graph_eal
from data_imports.import_logs import graph_import_fp
from data_imports.import_logs import graph_passport
from data_imports.import_logs import graph_passport_sensitive
from data_imports.import_logs import graph_rassilyator
from data_imports.import_logs import graph_sbapi_lookup
from data_imports.import_logs import graph_sovetnik
from data_imports.import_logs.watch_log import graph_watch_log
from data_imports.import_logs.webvisor import graph_webvisor
from lib.luigi import yt_luigi
from rtcconf import config
from utils import mr_utils as mr
from utils import utils

# Record-count limit used by reduce_incremental_merge: once the running record
# index inside one reduce key exceeds this value, raw records are diverted to the
# separate "huge yuids" output table instead of being accumulated.
# The value is also embedded into that table's name in import_long_lived_pairs.
REDUCE_INCREMENTAL_MERGE_THRESHOLD = 1000


def map_split_table(rec):
    """
    Mapper: copy the record's '_table_index' field into its '@table_index' key.

    The record is modified in place and the very same object is yielded.
    The surrounding code uses '@table_index' to pick the output table of a record.
    """
    destination = rec['_table_index']
    rec['@table_index'] = destination
    yield rec


def reduce_incremental_merge(_, recs, index_map, today, store_days, threshold=None):
    """
    Reducer: split the raw id records of a single yuid by yuid activity.

    Input records are told apart by '@table_index': index 0 is treated as the
    yuid_with_all record (it must carry 'all_dates'), every other index is a raw
    record carrying 'id_type', 'source_type' and 'id_date'.

    Every yielded record gets '@table_index' set to the output table and
    '_table_index' set to index_map["<id_type>_<source_type>"]:
      0 - all accumulated raw records of an active yuid;
      1 - the latest record per source pair of an active yuid;
      2 - the latest record per source pair of an inactive yuid;
      3 - raw records flushed because the yuid exceeded `threshold` records.

    A yuid is active when its yuid_with_all record has a date less than
    `store_days` days before `today`, or, if there is no yuid_with_all record
    at all, when one of the tracked raw records is dated `today`.

    Records are mutated in place and the same dict can be yielded more than once
    with a different '@table_index', so the consumer has to process each yielded
    record before asking for the next one.

    :param _: reduce key, unused
    :param recs: iterable of the records sharing the reduce key
    :param index_map: dict mapping "<id_type>_<source_type>" to a table index
    :param today: date string; compared with 'id_date' and parsed with
        utils.date_str_to_date
    :param store_days: size of the activity window in days
    :param threshold: record index after which raw records go to table 3;
        defaults to REDUCE_INCREMENTAL_MERGE_THRESHOLD
    """
    if threshold is None:
        threshold = REDUCE_INCREMENTAL_MERGE_THRESHOLD

    yuid_all_rec = None
    has_today_activity = False
    yuid_raw_recs = []
    last_activity_by_source = {}

    for rec_num, rec in enumerate(recs):
        if rec['@table_index'] == 0:
            yuid_all_rec = rec
            continue

        yuid_raw_recs.append(rec)

        if rec_num > threshold:
            # Threshold is passed: flush everything accumulated so far (including
            # the current record) to the huge-yuids table to bound memory usage.
            for huge_rec in yuid_raw_recs:
                pair_name = huge_rec['id_type'] + '_' + huge_rec['source_type']
                huge_rec['@table_index'] = 3
                huge_rec['_table_index'] = index_map[pair_name]
                yield huge_rec
            yuid_raw_recs = []
            # NOTE(review): records past the threshold are neither tracked as last
            # activity nor checked against `today` - confirm this is intended.
            continue

        # keep last activity date by every source
        pair_name = rec['id_type'] + '_' + rec['source_type']
        id_date = rec['id_date']
        known = last_activity_by_source.get(pair_name)
        if known is None or id_date > known[0]:
            last_activity_by_source[pair_name] = (id_date, rec)

        # active today by this source
        if id_date == today:
            has_today_activity = True

    is_active = False
    if yuid_all_rec:
        # Empty/None dates are dropped up front, so max() never has to compare
        # None with a string; with no usable date left the yuid stays inactive.
        known_dates = [d for d in (yuid_all_rec['all_dates'] or ()) if d]
        if known_dates:
            last_date = utils.date_str_to_date(max(known_dates))
            today_dt = utils.date_str_to_date(today)
            # has some another activity in the last N days
            is_active = (today_dt - last_date).days < store_days
    elif has_today_activity:
        # NOTE(review): today's raw activity is only consulted when there is no
        # yuid_with_all record; a stale yuid_with_all record wins over it.
        is_active = True

    if is_active:
        # all activity for active yuids to yuid_raw month table
        for rec in yuid_raw_recs:
            pair_name = rec['id_type'] + '_' + rec['source_type']
            rec['@table_index'] = 0
            rec['_table_index'] = index_map[pair_name]
            yield rec
        # only the last activity for active yuids to long live table (for stats)
        last_activity_table = 1
    else:
        # last activity of inactive yuids goes to the outdated table
        last_activity_table = 2

    # .items() instead of .iteritems(): works on Python 2 and 3, and the dict
    # holds at most one entry per source pair.
    for pair_name, (_last_date, rec) in last_activity_by_source.items():
        rec['@table_index'] = last_activity_table
        rec['_table_index'] = index_map[pair_name]
        yield rec


class IncrementalDayAndDumpMergeTask(yt_luigi.BaseYtTask):
    """
    Maintains the per-source yuid_raw month tables for a fixed set of sources.

    For every source the task either
    - merges a new single day into the previously built yuid_id dict, or
    - takes the dump of the source,
    and in both cases keeps only the active yuids.
    """
    date = luigi.Parameter()

    def input_folders(self):
        """Return the named input folders used by the task."""
        dicts_root = config.GRAPH_YT_DICTS_FOLDER
        day_root = config.YT_OUTPUT_FOLDER + self.date
        folders = {
            # to fetch yuid_with_all
            'dict': dicts_root,
            # prev day month table
            'yuid_raw_month': dicts_root + 'yuid_raw/',
            # today update
            'yuid_raw_day': day_root + '/yuid_raw/',
        }
        return folders

    def output_folders(self):
        """Return the named output folders used by the task."""
        folders = {}
        folders['long_live'] = config.YT_OUTPUT_FOLDER + self.date + '/yuid_raw/long_live/'
        folders['yuid_raw_month'] = config.GRAPH_YT_DICTS_FOLDER + 'yuid_raw/'
        return folders

    def __init__(self, *args, **kwargs):
        """
        Build the lists of "<id_type>_<source_type>" pair names handled by the task.

        The lists are assigned before the base class constructor is called.
        """

        def pair(id_type, source_type):
            # pair name layout: "<id_type>_<source_type>"
            return id_type + '_' + source_type

        incremental = [
            # events are rare
            pair(config.ID_TYPE_PHONE, config.ID_SOURCE_TYPE_WEBVISOR),
            pair(config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_WEBVISOR),
            pair(config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_AUTO),
            pair(config.ID_TYPE_PHONE, config.ID_SOURCE_TYPE_AUTO),
            # 1% watch log
            pair(config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_WATCH_LOG_MAILRU),

            # 1% watch log
            pair(config.ID_TYPE_VKCOM, config.ID_SOURCE_TYPE_WATCH_LOG),
            pair(config.ID_TYPE_VKCOM, config.ID_SOURCE_TYPE_FP),
            pair(config.ID_TYPE_VKCOM, config.ID_SOURCE_TYPE_BARLOG),
            pair(config.ID_TYPE_OKRU, config.ID_SOURCE_TYPE_BARLOG),
            # 1% watch log
            pair(config.ID_TYPE_OKRU, config.ID_SOURCE_TYPE_WATCH_LOG),

            # 1% watch log
            pair(config.ID_TYPE_AVITO, config.ID_SOURCE_TYPE_WATCH_LOG),
            # metro wi-fi redirect with mac
            pair(config.ID_TYPE_MAC, config.ID_SOURCE_TYPE_VMETRO),

            # Sovetnik social ids Facebook
            pair(config.ID_TYPE_FACEBOOK_ID, config.ID_SOURCE_TYPE_SOVETNIK),
            # Sovetnik social ids OK
            pair(config.ID_TYPE_OKRU, config.ID_SOURCE_TYPE_SOVETNIK),
            # Sovetnik social ids VK
            pair(config.ID_TYPE_VKCOM, config.ID_SOURCE_TYPE_SOVETNIK),
            # Sovetnik social names VK
            pair(config.ID_TYPE_VKCOM_NAME, config.ID_SOURCE_TYPE_SOVETNIK),
            pair(config.ID_TYPE_PHONE, config.ID_SOURCE_TYPE_DITMSK),
        ]

        dump = [
            pair(config.ID_TYPE_PHONE, config.ID_SOURCE_TYPE_YAMONEY),
            pair(config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_YAMONEY),
            pair(config.ID_TYPE_YAMONEY_ACCOUNT, config.ID_SOURCE_TYPE_YAMONEY),
            pair(config.ID_TYPE_YAMONEY_CARD_TOKEN, config.ID_SOURCE_TYPE_YAMONEY),
            pair(config.ID_TYPE_YAMONEY_INTERNAL, config.ID_SOURCE_TYPE_YAMONEY),
        ]
        if config.HAS_YANDEX_TICKETS == 'yes':
            dump.append(pair(config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_TICKETS))
            dump.append(pair(config.ID_TYPE_PHONE, config.ID_SOURCE_TYPE_TICKETS))

        self.yuid_source_types_incremental = incremental
        self.yuid_source_types_dump = dump
        self.yuid_source_types = incremental + dump

        super(IncrementalDayAndDumpMergeTask, self).__init__(*args, **kwargs)

    def import_long_lived_pairs(self, yuid_with_all,
                                in_yuid_raw_month, in_yuid_raw_day,
                                out_long_live_f, out_yuid_raw_month):
        """
        Rebuild the per-source month tables from yuid_with_all, the existing month
        tables and today's day tables.

        Steps: sort the inputs by yuid, reduce everything with
        reduce_incremental_merge into three temporary tables plus a "huge yuids"
        table, split each temporary table into per-source tables with
        map_split_table, drop the temporary tables, sort the results by yuid and
        stamp the new month tables with the task date.

        :param yuid_with_all: path of the yuid_with_all table (assumed sorted)
        :param in_yuid_raw_month: folder with the existing month tables
        :param in_yuid_raw_day: folder with today's tables
        :param out_long_live_f: folder for active/outdated/huge/temporary tables
        :param out_yuid_raw_month: folder for the new month tables
        """
        source_types = self.yuid_source_types

        def only_existing(tables):
            # input tables may not exist
            return [table for table in tables if yt.exists(table)]

        existing_yuid_with_id = only_existing(
            in_yuid_raw_month + 'yuid_with_' + s for s in source_types)
        today_update = only_existing(
            os.path.join(in_yuid_raw_day, 'yuid_with_' + s) for s in source_types)

        new_yuid_with_id = []  # all month active
        out_long_live = []     # only last rec of all active for statistics purpose
        out_outdated = []      # all inactive yuids
        for s in source_types:
            new_yuid_with_id.append(out_yuid_raw_month + 'yuid_with_' + s)
            out_long_live.append(out_long_live_f + 'active_yuids_' + s)
            out_outdated.append(out_long_live_f + 'outdated_yuids_' + s)

        # position of every source pair in the per-source output table lists
        index_map = {s: position for position, s in enumerate(source_types)}

        # assume yuid_with_all sorted
        mr.sort_all(today_update + existing_yuid_with_id, sort_by='yuid')

        # Reducing a lot of data straight into a lot of tables generates a huge
        # number of chunks (700k+ here), so everything is first reduced into a few
        # tables, and those are then split into the final per-source tables.
        tmp_crutch_table_new = out_long_live_f + '_tmp_crutch_new'
        tmp_crutch_table_long_live = out_long_live_f + '_tmp_crutch_long_live'
        tmp_crutch_table_outdated = out_long_live_f + '_tmp_crutch_outdated'
        huge_yuids_table = out_long_live_f + 'huge_yuids_updates_over_{}_recs'.format(
            REDUCE_INCREMENTAL_MERGE_THRESHOLD)

        reducer = partial(reduce_incremental_merge,
                          index_map=index_map, today=self.date,
                          store_days=int(config.STORE_DAYS))
        reduce_inputs = [yuid_with_all] + existing_yuid_with_id + today_update
        reduce_outputs = [tmp_crutch_table_new,
                          tmp_crutch_table_long_live,
                          tmp_crutch_table_outdated,
                          huge_yuids_table]
        yt.run_reduce(reducer,
                      reduce_inputs,
                      reduce_outputs,
                      reduce_by='yuid',
                      spec={
                          "yt_job_max_memory_bytes": config.YT_JOB_MAX_MEMORY_BYTES
                      })

        split_plan = [
            (tmp_crutch_table_new, new_yuid_with_id),
            (tmp_crutch_table_long_live, out_long_live),
            (tmp_crutch_table_outdated, out_outdated),
        ]
        for tmp_table, final_tables in split_plan:
            yt.run_map(map_split_table, tmp_table, final_tables)
        for tmp_table, _ in split_plan:
            mr.drop(tmp_table)

        mr.sort_all(new_yuid_with_id + out_long_live + out_outdated, sort_by='yuid')
        for month_table in new_yuid_with_id:
            mr.set_generate_date(month_table, self.date)

    def requires(self):
        """Return the import tasks of the task date that this merge depends on."""
        day = self.date
        upstream = []
        upstream.append(graph_barnavig.ImportBarNavigDayTask(date=day, run_date=day))
        upstream.append(graph_yamoney.ImportYandexMoneyDump(date=day))
        upstream.append(graph_import_fp.ImportFPDayTask(date=day, run_date=day))
        upstream.append(graph_webvisor.ImportWebvisorTask(date=day, run_date=day))  # webvisor
        upstream.append(graph_watch_log.ImportWatchLogDayTask(date=day, run_date=day))
        upstream.append(graph_dmp_ditmsk.ImportDitMskTask(date=day, run_date=day))
        upstream.append(graph_sovetnik.ImportSovetnikDayTask(date=day, run_date=day))
        return upstream

    def run(self):
        """Create the output folders and run the merge."""
        for folder_name in ('long_live', 'yuid_raw_month'):
            mr.mkdir(self.out_f(folder_name))

        self.import_long_lived_pairs(
            yuid_with_all=self.in_f('dict') + 'yuid_with_all',
            in_yuid_raw_month=self.in_f('yuid_raw_month'),
            in_yuid_raw_day=self.in_f('yuid_raw_day'),
            out_long_live_f=self.out_f('long_live'),
            out_yuid_raw_month=self.out_f('yuid_raw_month'))

    def output(self):
        """Return one date target per handled source pair (the month tables)."""
        targets = []
        for s in self.yuid_source_types:
            month_table = os.path.join(self.out_f('yuid_raw_month'), 'yuid_with_' + s)
            targets.append(yt_luigi.YtDateTarget(month_table, self.date))
        return targets


class FullMonthYuidMergeTask(yt_luigi.BaseYtTask):
    """
    For a fixed set of sources, merges all available day tables of the storage
    window into a single month table per source.
    """
    date = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        """
        Build the lists of source names and the derived table names.

        The lists are assigned before the base class constructor is called.
        """

        def pair(left, right):
            # table suffix layout: "<left>_<right>"
            return left + '_' + right

        # will be collected for a month
        yuid_sources = [
            config.ID_TYPE_BAR_UI,
            config.ID_TYPE_BAR_R1,
            config.ID_SOURCE_TYPE_EAL,
            config.ID_SOURCE_TYPE_PUNTO,
            config.ID_SOURCE_TYPE_EXTERNAL_BROWSERS,
            pair(config.ID_TYPE_BAR_UI, config.ID_SOURCE_TYPE_BROWSER_MANAGER),

            pair(config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_PAGE_TITLE),
            config.ID_SOURCE_TYPE_SENDER,

            config.ID_SOURCE_TYPE_FP,
            pair(config.ID_TYPE_PUID, config.ID_SOURCE_TYPE_FP),

            pair(config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_KINOPOISK),
            pair(config.ID_TYPE_KINOPOISK_UID, config.ID_SOURCE_TYPE_KINOPOISK),

            pair(config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_BARLOG),

            # doesn't provide ids matching, but some additional info fields instead
            config.FIELD_UA,  # client side
            pair(config.FIELD_UA, config.ID_SOURCE_TYPE_ACCESS_LOG),  # server side
            pair(config.ID_TYPE_IP, config.ID_SOURCE_TYPE_ACCESS_LOG),  # server side
        ]

        # will be collected for a month and enriched with yuid
        puid_sources = [
            pair(config.ID_TYPE_PHONE, config.ID_SOURCE_TYPE_PASSPORT),
            pair(config.ID_TYPE_PHONE, config.ID_SOURCE_TYPE_PASSPORT_SENSITIVE),
            # disabled:
            # config.ID_TYPE_INSTAGRAM_ID + '_' + config.ID_SOURCE_TYPE_INSTAGRAM_POCHTA,
            # config.ID_TYPE_INSTAGRAM_LOGIN + '_' + config.ID_SOURCE_TYPE_INSTAGRAM_POCHTA
        ]

        table_names = []
        for s in yuid_sources:
            table_names.append('yuid_with_' + s)
        for s in puid_sources:
            table_names.append('puid_with_' + s)

        self.yuid_source_types_day = yuid_sources
        self.puid_source_types_day = puid_sources
        self.tables = table_names

        super(FullMonthYuidMergeTask, self).__init__(*args, **kwargs)

    def input_folders(self):
        """Return the named input folders used by the task."""
        folders = {}
        folders['graph'] = config.YT_OUTPUT_FOLDER
        return folders

    def output_folders(self):
        """Return the named output folders used by the task."""
        folders = {}
        folders['yuid_raw'] = config.GRAPH_YT_DICTS_FOLDER + 'yuid_raw/'
        return folders

    def requires(self):
        """
        Return the day import tasks for every date returned by
        utils.get_dates_before(self.date, STORE_DAYS).
        """
        run_date = self.date
        day_task_classes = (
            graph_barnavig.ImportBarNavigDayTask,
            graph_watch_log.ImportWatchLogDayTask,
            graph_passport.ImportPassportPhoneDayTask,
            graph_passport_sensitive.ImportPassportPhoneBindingsDayTask,
            graph_eal.ImportEalDayTask,
            graph_import_fp.ImportFPDayTask,
            graph_webvisor.ImportWebvisorTask,
            graph_access.ImportAccessLogsDayTask,
            graph_sbapi_lookup.ImportSbApiMitbLogDayTask,
            # disabled: graph_insta.ImportInstagramDayTask
        )

        req = []
        for dt in utils.get_dates_before(self.date, int(config.STORE_DAYS)):
            for task_cls in day_task_classes:
                req.append(task_cls(date=dt, run_date=run_date))

            # TODO: remove this when sender logs stops being new
            sender_day_folder = config.STATBOX_SENDER_FOLDER + dt
            if yt.exists(sender_day_folder):
                req.append(graph_rassilyator.ImportSenderDayTask(date=dt, run_date=run_date))

        return req

    def run(self):
        """
        Sort-merge the existing day tables of every source into its month table
        and stamp all month tables with the task date.
        """
        graph_folder = self.in_f('graph')
        dict_yuid_raw_out_f = self.out_f('yuid_raw')
        mr.mkdir(dict_yuid_raw_out_f)

        store_days = int(config.STORE_DAYS)
        ops = []
        for t in self.tables:
            day_tables = mr.get_existing_date_tables(graph_folder, 'yuid_raw/' + t, store_days)
            if not day_tables:
                continue

            # assume tables contains unique (yuid, id_value, date) recs
            # so we can just merge with no reduce
            sort_op = yt.run_sort(day_tables,
                                  dict_yuid_raw_out_f + t,
                                  sort_by=['yuid', 'id_date'],
                                  sync=False)
            ops.append(sort_op)

        utils.wait_all(ops)

        # NOTE(review): the date is set for every table, including those skipped
        # above because no day tables were found - confirm this is intended.
        for month_table in self.tables:
            mr.set_generate_date(dict_yuid_raw_out_f + month_table, self.date)

    def output(self):
        """Return one date target per month table."""
        dict_yuid_raw_f = self.out_f('yuid_raw')
        targets = []
        for month_table in self.tables:
            targets.append(yt_luigi.YtDateTarget(dict_yuid_raw_f + month_table, self.date))
        return targets



if __name__ == '__main__':
    # Script entry point: only sets the yt proxy to config.MR_SERVER; no task is run.
    yt.config.set_proxy(config.MR_SERVER)

    # Leftover manual-run scaffolding, intentionally left disabled:
    # workdir = '//home/crypta/team/artembelov/long_live/'
    # mr.mkdir(workdir)



