#!/usr/bin/python
# coding=utf-8

import copy
import ctypes
from datetime import datetime
from functools import partial

import yt.wrapper as yt

import crypta_id_generator
import graph_vertices_pretty
from lib.luigi import yt_luigi
from rtcconf import config
from utils import mr_utils as mr
from utils import utils

# NOTE(review): not referenced anywhere in the visible part of this file; presumably a limit
# related to out-of-memory protection - confirm its users before changing or removing it
OOM_LIMIT = 100


def mkrec(key, subkey, value, table_index=0):
    """
    Build a key/subkey/value record routed to the given output table.

    :param key: value of the 'key' field
    :param subkey: value of the 'subkey' field
    :param value: value of the 'value' field
    :param table_index: index of the output table, stored under '@table_index'
    :return: dict with the four fields above
    """
    record = dict(key=key, subkey=subkey, value=value)
    record['@table_index'] = table_index
    return record


def uint64str(i):
    """
    Return the decimal string of ``i`` reinterpreted as an unsigned 64-bit integer.

    :param i: integer, may be negative
    :return: string representation of the ``ctypes.c_uint64`` value of ``i``
    """
    as_unsigned = ctypes.c_uint64(i)
    return str(as_unsigned.value)


def pair_sources(rec, left_or_right):
    """
    List the source names that back one side of a pair record.

    :param rec: pair record; 'y_y' pairs carry 'id_type' and 'yuid<N>_sources',
                any other pair type carries 'source_type' and 'match_type'
    :param left_or_right: side of the pair, substituted into 'yuid%s_sources' (callers pass 1 or 2)
    :return: list of '<id_type>_<source>' strings for 'y_y' pairs,
             otherwise a single '<source_type>_<match_type>' item
    """
    if rec['pair_type'] != 'y_y':
        # non yuid-yuid pairs have one source, identical for both sides
        return [rec['source_type'] + '_' + rec['match_type']]

    id_type = rec['id_type']
    side_sources = rec['yuid%s_sources' % left_or_right]
    return [id_type + '_' + source for source in side_sources]


def reduce_init_graph(pair_key, recs):
    """
    Turn all records of one id pair into graph edges and per-vertex source records.

    :param pair_key: reduce key; ``pair_key['key']`` must look like ``<id1>_<id2>``
    :param recs: pair records of this key; each has 'pair_type' (``<kind1>_<kind2>``)
                 plus the fields read by pair_sources
    :return: generator of records routed by '@table_index':
        0 - the edge in both directions (subkey '2'),
        1 - one record per id with its sources (``s=``) and id kind (``d=``),
        2 - the pair itself when the key / pair type cannot be split in two
            or an id of kind 'y' is not an integer
    """
    sources_left = set()
    sources_right = set()
    pair_type = None
    has_recs = False

    for rec in recs:
        has_recs = True
        pair_type = rec['pair_type']  # the pair type of the last record is the one used below
        sources_left.update(pair_sources(rec, 1))
        sources_right.update(pair_sources(rec, 2))

    if not has_recs:
        # without records there is no pair type to work with; emit nothing instead of failing
        return

    try:
        # both unpackings raise ValueError unless there is exactly one '_'
        y1, y2 = pair_key['key'].split('_')
        p1, p2 = pair_type.split('_')

        if y1 != y2:
            # check for valid yuid
            if p1 == 'y':
                int(y1)
            if p2 == 'y':
                int(y2)
    except ValueError:
        # sources are sorted so that the output does not depend on set iteration order
        yield {'key': pair_key['key'], 'p': pair_type,
               'sources': ','.join(sorted(sources_left.union(sources_right))), '@table_index': 2}
        return

    if y1 != y2:
        # self-loops produce neither edges nor vertices
        yield mkrec(y1, '2', y2, table_index=0)
        yield mkrec(y2, '2', y1, table_index=0)

        yield mkrec(y1, 'a', 's=' + ','.join(sorted(sources_left)) + '\td=' + p1, table_index=1)
        yield mkrec(y2, 'a', 's=' + ','.join(sorted(sources_right)) + '\td=' + p2, table_index=1)


def get_min_by_ts(cid1, cid2):
    """
    Pick one of two crypta ids: the one with the smaller trailing timestamp.

    The last 10 characters of every id are parsed as an integer and compared;
    on equal values the lexicographically smaller id is returned.

    :param cid1: first crypta id (string whose last 10 characters are digits)
    :param cid2: second crypta id (same format)
    :return: cid1 or cid2
    :raises ValueError: if the last 10 characters of an id are not an integer
    """
    ts1 = int(cid1[-10:])
    ts2 = int(cid2[-10:])  # was parsed from cid1, which made both timestamps always equal

    if ts1 != ts2:
        return cid1 if ts1 < ts2 else cid2

    # same timestamp: fall back to comparing the whole ids
    return cid1 if cid1 < cid2 else cid2


def get_min_cid(cid1, cid_type1, cid2, cid_type2):
    """
    Choose which of two typed crypta ids wins.

    An id of type 'i' always beats an id of any other type. When both ids are
    of type 'i', or neither is, get_min_by_ts decides and the type of the first
    id is reported.

    :return: tuple (winning crypta id, its type)
    """
    first_is_i = cid_type1 == 'i'
    second_is_i = cid_type2 == 'i'

    if first_is_i != second_is_i:
        # exactly one side has type 'i' - it wins unconditionally
        if first_is_i:
            return cid1, cid_type1
        return cid2, cid_type2

    # both 'i', or cid_type1 will be equal to cid_type2 and equal to 'g'
    return get_min_by_ts(cid1, cid2), cid_type1


def reduce_changed(key, recs):
    """
    Compare the new and the old vertex of one key and emit what has to be uploaded or removed.

    Input records are told apart by '@table_index':
        0 - single record stored as ``yuid_active_today``,
        1 - records stored as ``all_devids`` (only the first one is used),
        2 - the new vertex,
        3 - the old vertex.

    Output records are routed by '@table_index':
        0 - vertices to add or update (with 'status' and 'diff' set),
        1 - old vertices to remove (with 'status' set),
        2 - new vertices whose 'crypta_id_size' exceeds config.FINAL_MERGE_LIMIT.

    :param key: reduce key (not used in the body)
    :param recs: all records of this key from the four input tables
    :return: generator of the new/old vertex records, modified in place
    """
    # materialized because the records are scanned several times below
    recs = list(recs)

    # presumably get_singe_rec returns the only record of that table or a falsy value - TODO confirm
    new_vertex = mr.get_singe_rec(recs, table_index=2)
    old_vertex = mr.get_singe_rec(recs, table_index=3)

    yuid_active_today = mr.get_singe_rec(recs, table_index=0)

    all_devids = [rec for rec in recs if rec['@table_index'] == 1]
    devid_active_today = all_devids[0] if all_devids else None  # only used as a truthiness flag below

    def add_or_update(s, diff):
        """
        upload for two cases: new match or old match ttl update

        Yields new_vertex to table 0 when it fits config.FINAL_MERGE_LIMIT; otherwise yields it
        to table 2 with ',overlimit' appended to the status and, if there is an old vertex,
        also yields that old vertex for removal.

        :param s: upload info
        :param diff: if rec is diff comparing to previous vertices
        """
        if new_vertex['crypta_id_size'] <= config.FINAL_MERGE_LIMIT:
            new_vertex['@table_index'] = 0
            new_vertex['status'] = s
            new_vertex['diff'] = diff
            yield new_vertex
        else:
            # single place to track final merge overlimit crypta_ids
            new_vertex['@table_index'] = 2  # overlimit stats
            new_vertex['status'] = s + ',overlimit'
            yield new_vertex

            # NOTE(review): when the crypta id changed, old_vertex was already yielded for removal
            # by the caller, so it is yielded a second time here (same dict, new status) - confirm
            if old_vertex:
                yield remove('overlimit')

    def remove(s):
        # clean up old crypta id matches or old vertices
        # mutates and returns the shared old_vertex dict, routed to table 1
        old_vertex['@table_index'] = 1
        old_vertex['status'] = s
        return old_vertex

    if new_vertex:
        # Active today or no yesterday record - yield:
        # 1) Active today - we need this to update TTL in BB
        # 2) No yesterday - we need this in case when some yuid from the past(not active today) matched
        if old_vertex:
            diff = False
            if yuid_active_today or devid_active_today:
                status = 'active_today'
                diff = True  # active today counts as diff, because we need to update time
            else:
                status = ''

            new_crypta_id = new_vertex['crypta_id']
            old_crypta_id = old_vertex['crypta_id']

            # note: with an empty status the joined value starts with a comma, e.g. ',keep_crypta_id'
            if new_crypta_id == old_crypta_id:
                status = ','.join([status, 'keep_crypta_id'])
            else:
                status = ','.join([status, 'change_crypta_id'])
                diff = True
                # the old crypta id match is removed before the new one is uploaded
                yield remove(status)

            for r in add_or_update(status, diff=diff):
                yield r
            return

        else:
            # no old vertex: a brand new match
            for r in add_or_update('new', diff=True):
                yield r
            return

    elif old_vertex:
        # the vertex existed before but is absent now
        yield remove('obsolete')
        return
    # else:
    # otherwise not in vertices at all (single yuids and devids)


def map_final_merge_limit(rec):
    """
    Mapper that keeps only records whose 'crypta_id_size' fits config.FINAL_MERGE_LIMIT.

    :param rec: record with a 'crypta_id_size' field
    :return: generator yielding the record unchanged, or nothing when it is over the limit
    """
    within_limit = rec['crypta_id_size'] <= config.FINAL_MERGE_LIMIT
    if not within_limit:
        return
    yield rec


def map_orig_source(vertex):
    """
    Mapper that copies the 's' field of a record value into an extra 'orig_s' field.

    :param vertex: key/subkey/value record
    :return: generator with a single record: a rebuilt one with '\\torig_s=<s>' appended
             to the value, or the input record itself when the value has no 's' field
    """
    source_value = mr.get_field_value('s', vertex['value'])
    if not source_value:
        yield vertex  # though it's probably not a vertex
        return

    extended_value = vertex['value'] + '\torig_s=' + source_value
    yield mkrec(vertex['key'], vertex['subkey'], extended_value)


def reduce_vertex_id_type(key, recs):
    """
    Append an 'id_type' field to the vertex records of one key.

    Records with subkey 'yi' are dictionary records: their 'ua_profile' field supplies the
    profile kind (the part before the first '|'). All other records are treated as vertices
    and are buffered until the whole group has been read.

    :param key: reduce key, must contain 'key'
    :param recs: dictionary and vertex records of this key
    :return: generator of vertex records; those with 'd' equal to 'd' or 'y' are rebuilt with
             '\\tid_type=<type>' appended to the value, the rest are passed through unchanged
    """
    key = key['key']
    yuid_type_by_profile = {'m': 'yuid-mobile', 'd': 'yuid-desktop'}

    profile_kind = None
    buffered_vertices = []

    for rec in recs:
        if rec['subkey'] != 'yi':
            buffered_vertices.append(rec)  # there shouldn't be more than 1 record actually
            continue

        # dict record
        ua_profile = mr.get_field_value('ua_profile', rec['value'])
        if ua_profile:
            profile_kind = ua_profile.split('|')[0]  # 'd'/'m' for desktop/mobile

    for vertex in buffered_vertices:
        vertex_kind = mr.get_field_value('d', vertex['value'])

        if vertex_kind == 'd':
            id_type = 'deviceid'
        elif vertex_kind == 'y':
            id_type = yuid_type_by_profile.get(profile_kind, 'yuid-unknown')
        else:
            id_type = None

        if id_type is None:
            yield vertex
        else:
            yield mkrec(vertex['key'], vertex['subkey'], vertex['value'] + '\tid_type=' + id_type)


def run_graph_complete(src, out_vertices_table, workdir, yuid_ua):
    """
    Run the chain of YT operations that turns the pairs table into a sorted vertices table.

    :param src: input pairs table; pairs rejected by reduce_init_graph go to ``src + '_yuid_err'``
    :param out_vertices_table: destination table, sorted by 'key'
    :param workdir: path prefix of the intermediate 'edges*' and 'vertices' tables
    :param yuid_ua: table reduced together with the vertices to derive 'id_type'
    """
    # NOTE(review): kept function-local; reassign_graph_cids describes the same local import
    # as a workaround for YT segmentation faults
    import graph_vertices_reassign as graph_steps

    # all intermediate tables
    vertices_tmp = workdir + 'vertices'
    edges = workdir + 'edges'
    edges_cc = edges + '_cc'
    edges_merges = edges + '_merges'
    edges_fakecid = edges + '_fakecid'

    def run_cc_iteration(iteration_input):
        # one cc_mr_iter pass: rewrites edges_cc and edges_merges from the given input
        yt.run_map_reduce(None, graph_steps.cc_mr_iter, iteration_input,
                          [edges_cc, edges_merges],
                          sort_by=['key', 'value'], reduce_by='key')

    # split pairs into edges, raw vertices and rejected pairs
    yt.run_map_reduce(None, reduce_init_graph, src,
                      [edges, vertices_tmp, src + '_yuid_err'],
                      sort_by=['key', 'subkey'], reduce_by='key')
    yt.run_map_reduce(None, graph_steps.reduce_add_cid, vertices_tmp, vertices_tmp,
                      sort_by=['key', 'subkey'], reduce_by='key')
    yt.run_map(map_orig_source, vertices_tmp, vertices_tmp)

    # first pass reads the raw edges, further passes re-read edges_cc
    # for as long as the merges table is not empty
    run_cc_iteration(edges)
    while yt.row_count(edges_merges) > 0:
        run_cc_iteration(edges_cc)

    yt.run_map(graph_steps.cc_mr_remove_backwards_edges, edges_cc, edges_cc)

    yt.run_sort(edges_cc, sort_by='key')
    yt.run_reduce(graph_steps.reduce_add_component_center, edges_cc, edges_fakecid,
                  reduce_by='key')

    yt.run_map_reduce(graph_steps.map_fake_cid_ts, graph_steps.reduce_fake_cid,
                      edges_fakecid, edges_fakecid,
                      sort_by=['key', 'ts'], reduce_by='key')

    # join the fake cids to the vertices
    yt.run_sort([edges_fakecid, vertices_tmp], vertices_tmp,
                sort_by=['key', 'subkey'])
    yt.run_reduce(graph_steps.join_edges_vertices_cid, vertices_tmp, vertices_tmp,
                  sort_by=['key', 'subkey'], reduce_by='key')

    # add id_type and publish the result
    yt.run_map_reduce(None, reduce_vertex_id_type,
                      [yuid_ua, vertices_tmp], vertices_tmp,
                      sort_by=['key', 'subkey'], reduce_by='key')
    yt.run_sort(vertices_tmp, out_vertices_table, sort_by='key')
    yt.remove(vertices_tmp)


def reassign_graph_cids(vertices_folder, in_vertices_table, dt, is_cids, final_vertices):
    """
    Run the chain of YT operations that reassigns crypta ids of the vertices using ``is_cids``.

    Intermediate tables are created under ``vertices_folder + 'reassign/'``.

    :param vertices_folder: folder of the vertices, must end with '/'
    :param in_vertices_table: input vertices table (assumed sorted by 'key')
    :param dt: value passed to mk_cid_gen_reducer
    :param is_cids: table reduced together with the vertices by 'key' (assumed sorted)
    :param final_vertices: output vertices table, sorted by 'key' at the end
    """
    import graph_vertices_reassign as rakes  # import here as a workaround for YT segmentation faults

    workdir = vertices_folder + 'reassign/'
    mr.mkdir(workdir)

    # intermediate tables, numbered in the order they are produced
    prefix = workdir + 'vertices'
    old_cids = prefix + '_1_old_cids'
    intersection = prefix + '_2_intersection'
    proper_old_cids = prefix + '_3_proper_old_cids'
    no_duplicate_oldcids = prefix + '_4_no_duplicate_oldcids'
    new_cids_generated = prefix + '_5_new_cids_generated'
    reassigned_cids = prefix + '_6_reassigned_cids'
    fakecid_conflicts = prefix + '_fakecid_conflicts'
    newcid_conflicts = prefix + '_newcid_conflicts'

    # assume all sorted
    yt.run_reduce(rakes.reduce_old_cids,
                  [in_vertices_table, is_cids],
                  old_cids,
                  reduce_by='key')

    yt.run_map_reduce(None, rakes.count_cid_intersection_sizes_and_ts,
                      old_cids, intersection,
                      reduce_by=['fakecid', 'oldcid'])

    yt.run_map_reduce(None, rakes.reduce_find_best_oldcids,
                      intersection, proper_old_cids,
                      sort_by=['fakecid', 'intersect_size', 'oldcid_ts', 'oldcid'],
                      reduce_by='fakecid')

    yt.run_map_reduce(None, rakes.reduce_fix_duplicate_oldcid,
                      proper_old_cids, no_duplicate_oldcids,
                      sort_by=['oldcid', 'intersect_size'],
                      reduce_by='oldcid')

    yt.run_map_reduce(None, rakes.mk_cid_gen_reducer(dt),
                      no_duplicate_oldcids, new_cids_generated,
                      reduce_by='newcid')

    # two conflict tables and the reassignment itself are started without waiting
    fakecid_conflicts_op = yt.run_map_reduce(None, rakes.mk_uniq_cid_reducer('fakecid', 'newcid'),
                                             new_cids_generated, fakecid_conflicts,
                                             reduce_by='fakecid', sync=False)
    newcid_conflicts_op = yt.run_map_reduce(None, rakes.mk_uniq_cid_reducer('newcid', 'fakecid'),
                                            new_cids_generated, newcid_conflicts,
                                            reduce_by='newcid', sync=False)
    reassign_op = yt.run_map_reduce(rakes.map_reassign_cids_join_order, rakes.reduce_reassign_cids,
                                    [new_cids_generated, old_cids],
                                    reassigned_cids,
                                    sort_by=['fakecid', 'jord'],
                                    reduce_by='fakecid', sync=False)
    utils.wait_all([fakecid_conflicts_op, newcid_conflicts_op, reassign_op])

    # bring the reassigned ids back to the original vertices
    yt.run_sort(reassigned_cids, sort_by='key')
    yt.run_reduce(rakes.join_reassigned_cid_back,
                  [in_vertices_table, reassigned_cids],
                  final_vertices,
                  reduce_by='key')

    yt.run_sort(final_vertices, sort_by='key')



def change_crypta_id_for_experiment(rec, experiment_id):
    """
    Mapper that replaces the crypta id of a record with its experiment-specific variant.

    The previous id is remembered under rec['crypta_id_history']['before_experiment'].

    :param rec: record with a 'crypta_id' field; modified in place
    :param experiment_id: id of the experiment, also stored under 'experiment_id'
    :return: generator yielding the modified record
    """
    # crypta ids shouldn't intersect among experiments, let's change one symbol
    original_crypta_id = rec['crypta_id']
    experiment_crypta_id = crypta_id_generator.change_crypta_id_for_experiment(original_crypta_id,
                                                                               experiment_id)

    history = rec.setdefault('crypta_id_history', dict())
    history['before_experiment'] = original_crypta_id

    rec['crypta_id'] = experiment_crypta_id
    rec['experiment_id'] = experiment_id
    yield rec


def prepare_for_experiment(in_vertices_table, out_vertices_table, vertices_type):
    """
    Produce the vertices table for the experiment configured for ``vertices_type``.

    When config.VERTICES_EXPERIMENTS has an experiment id for the type, the crypta ids are
    rewritten and the output is sorted by 'key'. Otherwise the input is copied to the output
    unless both are the same table.

    :param in_vertices_table: source vertices table
    :param out_vertices_table: destination vertices table
    :param vertices_type: key in config.VERTICES_EXPERIMENTS
    """
    experiment_id = config.VERTICES_EXPERIMENTS.get(vertices_type)

    if not experiment_id:
        # no experiment for this type: just make sure the output table exists
        if in_vertices_table != out_vertices_table:
            mr.copy(in_vertices_table, out_vertices_table)
        return

    experiment_mapper = partial(change_crypta_id_for_experiment, experiment_id=experiment_id)
    yt.run_map(experiment_mapper, in_vertices_table, out_vertices_table)
    yt.run_sort(out_vertices_table, sort_by='key')


def separate_large_crypta_ids(rec):
    """
    Mapper that routes a record by its 'crypta_id_size'.

    :param rec: record with a 'crypta_id_size' field; modified in place
    :return: generator yielding the record with '@table_index' set to 1 when
             the size is above 5000 and to 0 otherwise
    """
    rec['@table_index'] = 1 if rec['crypta_id_size'] > 5000 else 0
    yield rec


class V2VerticesConfig:
    """
    Description of a vertices set stored directly under ``path``.

    Such a config has no previous-day counterpart and is never treated as main
    or experiment vertices: the corresponding flags are fixed in the constructor.
    """

    def __init__(self, path, vertices_type, date, producing_task):
        # fixed flags
        self.main_vertices = False
        self.bb_experiment = False
        self.vertices_only = True

        # caller-supplied description
        self.path = path
        self.vertices_type = vertices_type
        self.date = date
        self.producing_task = producing_task

    def get_vertices_folder(self):
        """Return the folder of the vertices, which is ``path`` itself."""
        return self.path

    def get_vertices_table(self):
        """Return the path of the 'vertices' table inside the folder."""
        return self.path + 'vertices'

    def get_previous_vertices_config(self, days_before=1):
        """Always None: there is no config for earlier days."""
        return None

    def get_prev_vertices_table(self):
        """Always None: there is no previous vertices table."""
        return None

    def has_reassigns(self):
        """Return the ``main_vertices`` flag."""
        return self.main_vertices


class VerticesConfig:
    """
    Description of a dated vertices set located at ``base_path + date + '/' + relative_path``.

    ``relative_path`` is concatenated with table names as is, so it has to be
    either empty or end with '/'.
    """

    def __init__(self, relative_path, vertices_type, date, producing_task=None,
                 base_path=config.YT_OUTPUT_FOLDER,
                 bb_experiment=False, main_vertices=False, vertices_only=False):
        # location
        self.base_path = base_path
        self.relative_path = relative_path
        self.date = date

        # description
        self.vertices_type = vertices_type
        self.producing_task = producing_task

        # flags
        self.main_vertices = main_vertices
        self.bb_experiment = bb_experiment
        self.vertices_only = vertices_only

    def _describe(self):
        # shared text of __str__ and __repr__
        return '%s(vertices_type=%s, date=%s)' % (self.__class__.__name__, self.vertices_type, self.date)

    def __str__(self):
        # used as luigi param representation
        return self._describe()

    def __repr__(self):
        # used as luigi param representation in lists
        return self._describe()

    def _get_date_table(self, postfix):
        """Return the path of ``postfix`` inside the folder of this config's date."""
        return self.base_path + self.date + '/' + postfix

    def _get_relative_table(self, name):
        # path of a table located under relative_path of this config's date
        return self._get_date_table(self.relative_path + name)

    def get_previous_vertices_config(self, days_before=1):
        """Return a shallow copy of this config dated ``days_before`` days earlier."""
        previous = copy.copy(self)
        previous.date = utils.get_date_before(self.date, days_before)
        return previous

    def get_vertices_folder(self):
        """Return the folder of the vertices with a trailing '/'."""
        folder = self._get_date_table(self.relative_path.strip('/'))
        # NOTE(review): folder always contains at least one '/', so the fallback
        # below looks unreachable - confirm what it was meant to cover
        return (folder or '/') + '/'

    def get_vertices_table(self):
        return self._get_relative_table('vertices')

    def get_prev_vertices_table(self):
        return mr.get_prev_table(self.base_path, self.date, self.relative_path + 'vertices', raise_exc=False)

    def get_edges_table(self):
        return self._get_relative_table('edges')

    def get_upload_table(self):
        return self._get_relative_table('upload/vertices_add_or_update')

    def get_upload_rm_table(self):
        return self._get_relative_table('upload/vertices_remove')

    def has_reassigns(self):
        return self.main_vertices  # only main production vertices needs reassign from IS


class BaseVerticesTask(yt_luigi.BaseYtTask):
    """
    Base task that builds the vertices and edges tables described by a vertices config.

    Subclasses are expected to call vertices_pipeline with a table of raw pairs; the
    results are the 'vertices', 'vertices_large', 'edges' and 'edges_large' tables.
    """

    def __init__(self, vertices_config, **kwargs):
        # stored before the parent constructor runs; presumably the base class
        # calls input_folders()/output_folders() from its __init__ - TODO confirm
        self.vertices_config = vertices_config
        super(BaseVerticesTask, self).__init__(**kwargs)

    def input_folders(self):
        return {
            'dict': config.GRAPH_YT_DICTS_FOLDER,
            'devid_dict': config.INDEVICE_YT_FOLDER + self.vertices_config.date + '/perfect/',
            'is_output': config.IS_OUTPUT_FOLDER
        }

    def output_folders(self):
        return {
            'vertices_folder': self.vertices_config.get_vertices_folder()
        }

    def requires(self):
        """
        List the tasks producing the dictionaries used by the pipeline.

        Tasks producing the IS crypta ids are added only when the config has reassigns.
        """
        from data_imports.import_logs.idserv import graph_cid
        from matching.yuid_matching import graph_dict
        from matching.device_matching.perfect.device_yuid_perfect_dict import DeviceYuidsMergePerfectDictMonthTask
        from matching.device_matching.app_metrica import app_metrica_to_old_formats

        dt = self.vertices_config.date
        tasks = [
            graph_dict.YamrFormatDicts(dt),  # yuid_ua
            DeviceYuidsMergePerfectDictMonthTask(dt),  # devid_yuid_all for pretty
            app_metrica_to_old_formats.ConvertAppMetricaDictsToOldFormats(dt),  # dev_info for pretty
            graph_dict.YuidAllIdDictsTask(dt),  # yuid_with_all for pretty
        ]

        if self.vertices_config.has_reassigns():
            tasks.append(graph_cid.GraphCidTask(dt))
            tasks.append(graph_cid.LostIsYuidsMonitoringTask(dt))

        return tasks

    def create_vertices(self, in_pairs_table, out_vertices_table, out_edges_table):
        """
        produces enriched vertices in pretty format
        :param in_pairs_table: all collected pairs merged to single table
        :param out_vertices_table: out pretty vertices
        :param out_edges_table: out pretty edges
        :return:
        """
        workdir = self.vertices_config.get_vertices_folder()
        vertices_raw_table = workdir + 'vertices_0'  # old style yamr vertices

        # this code only can produce yamr format vertices
        mr.mkdir(workdir + 'mr_cc')
        run_graph_complete(in_pairs_table, vertices_raw_table, workdir + 'mr_cc/',
                           self.in_f('dict') + 'yuid_ua')

        # enrich vertices and edges with additional info and make it yt style
        graph_vertices_pretty.prettify(workdir,
                                       vertices_raw_table, in_pairs_table,
                                       self.in_f('dict'),
                                       self.in_f('devid_dict'),
                                       out_vertices_table, out_edges_table)

    def reassign_crypta_id_from_is(self, in_vertices_table, out_vertices_table):
        """
        Reassign crypta ids of the vertices using the 'mapping_yuid_cid' table of the IS output.

        :param in_vertices_table: pretty vertices to read
        :param out_vertices_table: where the vertices with reassigned crypta ids are written
        """
        reassign_graph_cids(self.vertices_config.get_vertices_folder(),
                            in_vertices_table,
                            datetime.strptime(self.vertices_config.date, '%Y-%m-%d'),
                            is_cids=self.in_f('is_output') + 'mapping_yuid_cid',
                            final_vertices=out_vertices_table)

    def prepare_for_bb_experiment(self, in_vertices_table, out_vertices_table):
        """Apply the experiment configured for the vertices type of this task's config."""
        prepare_for_experiment(in_vertices_table, out_vertices_table, self.vertices_config.vertices_type)

    def vertices_pipeline(self, edges_raw_table, workdir):
        """
        Build the final vertices and edges tables from the raw pairs.

        :param edges_raw_table: all collected pairs merged to a single table
        :param workdir: folder (ending with '/') where 'vertices', 'vertices_large',
                        'edges' and 'edges_large' are written
        """
        edges_pretty_tmp = workdir + 'edges_tmp'
        vertices_pretty_tmp = workdir + 'vertices_tmp'

        # produces enriched vertices in pretty format
        self.create_vertices(in_pairs_table=edges_raw_table,
                             out_vertices_table=vertices_pretty_tmp,
                             out_edges_table=edges_pretty_tmp)

        crypta_ids_changed = False
        if self.vertices_config.has_reassigns():
            self.reassign_crypta_id_from_is(in_vertices_table=vertices_pretty_tmp,
                                            out_vertices_table=vertices_pretty_tmp)
            crypta_ids_changed = True

        if self.vertices_config.bb_experiment:
            self.prepare_for_bb_experiment(in_vertices_table=vertices_pretty_tmp,
                                           out_vertices_table=vertices_pretty_tmp)
            crypta_ids_changed = True

        if crypta_ids_changed:
            # also merge all changes in crypta ids to edges
            yt.run_reduce(graph_vertices_pretty.join_crypta_id_from_vertices_to_edges,
                          [vertices_pretty_tmp,
                           edges_pretty_tmp],
                          [edges_pretty_tmp,
                           workdir + 'debug_lost_vertices'],
                          reduce_by='key')

        # large vertices cause oom in later process
        utils.wait_all([
            yt.run_map(separate_large_crypta_ids,
                       edges_pretty_tmp,
                       [edges_pretty_tmp,
                        edges_pretty_tmp + '_large'],
                       sync=False),
            yt.run_map(separate_large_crypta_ids,
                       vertices_pretty_tmp,
                       [vertices_pretty_tmp,
                        vertices_pretty_tmp + '_large'],
                       sync=False)
        ])

        # publish the four result tables, all sorts are started without waiting
        utils.wait_all([
            yt.run_sort(edges_pretty_tmp, workdir + 'edges',
                        sort_by=['crypta_id', 'crypta_id_size', 'id1', 'id2'], sync=False),
            yt.run_sort(edges_pretty_tmp + '_large', workdir + 'edges_large',
                        sort_by=['crypta_id', 'crypta_id_size', 'id1', 'id2'], sync=False),
            yt.run_sort(vertices_pretty_tmp, workdir + 'vertices',
                        sort_by='key', sync=False),
            yt.run_sort(vertices_pretty_tmp + '_large', workdir + 'vertices_large',
                        sort_by='key', sync=False)
        ])

        # temporary tables are not needed once the sorted copies exist
        mr.drop(edges_pretty_tmp)
        mr.drop(edges_pretty_tmp + '_large')
        mr.drop(vertices_pretty_tmp)
        mr.drop(vertices_pretty_tmp + '_large')

    def output(self):
        """Return the targets of the four result tables; the '*_large' ones may be empty."""
        out_folder = self.out_f('vertices_folder')
        return [yt_luigi.YtTarget(out_folder + 'vertices'),
                yt_luigi.YtTarget(out_folder + 'vertices_large', allow_empty=True),
                yt_luigi.YtTarget(out_folder + 'edges'),
                yt_luigi.YtTarget(out_folder + 'edges_large', allow_empty=True)]


if __name__ == '__main__':
    # when run as a script, only the YT client is configured here
    yt.config.set_proxy(config.MR_SERVER)
    # YSON tabular format created with process_table_index=True
    yt.config["tabular_data_format"] = yt.YsonFormat(process_table_index=True)

