from functools import partial
from itertools import product

import luigi
import yt.wrapper as yt

from human_matching import graph_vertices
from lib import yt_trace
from lib.luigi import yt_luigi
from rtcconf import config
from utils import mr_utils as mr
from utils import utils


def puid_uuid_devid_mapper(rec):
    '''
    Mapper of the uuid join (uuid-puid rows with uuid-devid rows).

    Keeps only the fields needed for the join:
      * a row with uuid and devid yields {'uuid', 'devid'};
      * otherwise a row with uuid and puid yields {'uuid', 'puid'};
      * a row without uuid, or with neither devid nor puid, is dropped.
    devid wins when a row carries both devid and puid.
    '''
    uuid = rec.get('uuid')
    if not uuid:
        return

    devid = rec.get('devid')
    puid = rec.get('puid')
    if devid:
        yield {'uuid': uuid, 'devid': devid}
    elif puid:
        yield {'uuid': uuid, 'puid': puid}

def puid_uuid_devid_reducer(_, recs):
    '''
    Reducer of the uuid join (uuid-puid rows with uuid-devid rows).

    Receives all rows of one reduce key and emits {'puid', 'devid'} for
    every combination of a puid and a devid seen among them. A row counts
    as a puid row when it has a puid, otherwise as a devid row when it has
    a devid. Duplicates are not removed. Nothing is emitted unless both
    kinds of rows are present.
    '''
    collected = {'puid': [], 'devid': []}
    for rec in recs:
        # puid is checked first; the first non-empty field decides the bucket
        for field in ('puid', 'devid'):
            found = rec.get(field)
            if found:
                collected[field].append(found)
                break

    for puid in collected['puid']:
        for devid in collected['devid']:
            yield {'puid': puid, 'devid': devid}

def prepare_input_tables(rec, in_indexes=None):
    '''
    Mapper: brings the rows of every input table to one common layout.

    The source table of a row is recognised by rec['@table_index'], which
    is compared with the indexes in `in_indexes` (a dict: table name ->
    index of that table in the operation input). `in_indexes` has no usable
    default and must be bound by the caller, e.g. with functools.partial.

    Every emitted row has 'key', 'id_type' and 'value_type'.
      * rows of the data tables additionally carry 'value';
      * a row of the 'matching' table carries 'crypta_id' instead and is
        emitted once per value type ('phonehash', 'emailhash', 'puid').
    Rows with an empty required field and rows of unknown tables are
    dropped. An 'id_type' that is missing or null is treated as empty.
    '''
    table_index = rec['@table_index']

    if table_index == in_indexes['account_manager']:
        id_hash = rec.get('id_hash')
        devid = rec.get('deviceid')
        # `or ''` also covers a field that is present but null:
        # the default of .get() only applies when the key is absent
        id_type = (rec.get('id_type') or '').replace('email', 'emailhash').replace('phone', 'phonehash')

        if id_hash and devid and id_type:
            yield {
                'key': devid,
                'id_type': 'deviceid',
                'value_type': id_type,
                'value': id_hash
            }

    elif table_index == in_indexes['yuid_with_id_email']:
        yuid = rec.get('key')
        id_value = rec.get('id_value')

        if yuid and id_value:
            yield {
                'key': yuid,
                'id_type': 'yuid',
                'value_type': 'emailhash',
                'value': utils.md5(id_value)
            }

    elif table_index == in_indexes['yuid_with_id_phone']:
        yuid = rec.get('key')
        id_value = rec.get('id_value')

        if yuid and id_value:
            # unlike emails, the phone value is passed through as is
            yield {
                'key': yuid,
                'id_type': 'yuid',
                'value_type': 'phonehash',
                'value': id_value
            }

    elif table_index in (
            in_indexes['puid_with_devid'],
            in_indexes['puid_with_through_uuid']
    ):  # rows look like {'puid': puid, 'devid': devid}
        puid = rec.get('puid')
        devid = rec.get('devid')
        if puid and devid:
            yield {
                'key': devid,
                'id_type': 'deviceid',
                'value_type': 'puid',
                'value': puid
            }

    elif table_index == in_indexes['puid_with_yuid']:  # yuid-puid pairs
        key = rec.get('yuid')
        id_value = rec.get('puid')
        if key and id_value:
            yield {
                'key': key,
                'id_type': 'yuid',
                'value_type': 'puid',
                'value': id_value
            }

    elif table_index == in_indexes['matching']:
        key = rec.get('key')
        crypta_id = rec.get('crypta_id')
        # only the part before the first '-' is kept; null is treated as empty
        id_type = (rec.get('id_type') or '').split('-')[0]

        if crypta_id and key and id_type:  # key = yuid or devid (id_type)
            # same crypta_id row for each value type
            for value_type in ('phonehash', 'emailhash', 'puid'):
                yield {
                    'key': key,
                    'id_type': id_type,
                    'value_type': value_type,
                    'crypta_id': crypta_id
                }

def get_cryptaid_for_value(keys, recs, out_indexes=None):
    '''
    Reducer: for one (key, id_type, value_type) group pairs the values
    attached to the id with the crypta_ids attached to the same id.

    `keys` must contain 'key', 'id_type' and 'value_type'. A row of the
    group is a value row when it has a non-empty 'value', otherwise a
    crypta_id row when it has a non-empty 'crypta_id'. `out_indexes` maps
    ('nocryptaid', value_type), ('cryptaid', value_type) and 'errors' to
    output table indexes; it has no usable default and must be bound by
    the caller.

    Emits, in this order:
      * for every value: {value_type: value, 'sec_value', 'sec_value_type'}
        to the 'nocryptaid' table ('deviceid' is renamed to 'devid');
      * when the id has more than one crypta_id: one diagnostic row to the
        'errors' table;
      * for every crypta_id and every value: {value_type: value,
        'crypta_id'} to the 'cryptaid' table.
    '''
    value_type = keys['value_type']
    found_values = set()
    found_cryptaids = set()
    for rec in recs:
        value = rec.get('value')
        if value:
            found_values.add(value)
        else:
            crypta_id = rec.get('crypta_id')
            if crypta_id:
                found_cryptaids.add(crypta_id)

    # direct pairs value <-> id, known without any matching
    for value in found_values:
        yield {
            value_type: value,
            'sec_value': keys['key'],
            'sec_value_type': keys['id_type'].replace('deviceid', 'devid'),
            '@table_index': out_indexes[('nocryptaid', value_type)]
        }

    # one id attached to several crypta_ids is reported, but still used below
    if len(found_cryptaids) > 1:
        yield {
            'vals': ' '.join(found_values),
            'cryptaids': ' '.join(found_cryptaids),
            'valtype': value_type,
            'id_type': keys['id_type'],
            'id_value': keys['key'],
            '@table_index': out_indexes['errors']
        }

    for crypta_id in found_cryptaids:
        for value in found_values:
            yield {
                value_type: value,
                'crypta_id': crypta_id,
                '@table_index': out_indexes[('cryptaid', value_type)]
            }

def get_vals_by_cryptaid(keys, recs, value=None):
    '''
    Reducer: for one crypta_id pairs every {value} with every devid and
    yuid that belong to the same crypta_id.

    `value` is the name of the value field ('phonehash', 'emailhash' or
    'puid' in this module) and must be bound by the caller. Rows of the
    group are told apart by 'id_type':
      * empty or missing -> value row, the value is read from rec[value];
      * 'deviceid'       -> devid row, the id is read from rec['key'];
      * starts with 'yuid' -> yuid row, the id is read from rec['key'];
      * anything else is ignored.

    Emits {value, 'sec_value', 'sec_value_type', 'crypta_id'}; for each
    value all devids come first, then all yuids.
    '''
    found_values = set()
    devids = set()
    yuids = set()
    for rec in recs:
        id_type = rec.get('id_type', '')
        if not id_type:
            bucket, item = found_values, rec.get(value)
        elif id_type == 'deviceid':
            bucket, item = devids, rec.get('key')
        elif id_type.startswith('yuid'):
            bucket, item = yuids, rec.get('key')
        else:
            continue
        if item:
            bucket.add(item)

    for found in found_values:
        for sec_type, sec_values in (('devid', devids), ('yuid', yuids)):
            for sec_value in sec_values:
                yield {
                    value: found,
                    'sec_value': sec_value,
                    'sec_value_type': sec_type,
                    'crypta_id': keys['crypta_id']
                }

def join_and_split_result_tables(keys, recs, value=None):
    '''
    Reducer: merges the pairs found through crypta_id with the pairs known
    directly, and splits the result into a yuid table and a devid table.

    One group is one ({value}, 'sec_value_type', 'sec_value') pair; `value`
    is the name of the value field and must be bound by the caller. The
    group is skipped when the value is empty or 'sec_value_type' is neither
    'yuid' nor 'devid'. Otherwise exactly one row is emitted:
      * {value} and the id stored under its own type name (yuid / devid);
      * 'crypta_id'  - the last non-empty crypta_id of the group, or None;
      * 'has_direct' - True when at least one row had no crypta_id;
      * '@table_index' - 0 for yuid, 1 for devid.
    '''
    val = keys.get(value)
    id_type = keys['sec_value_type']
    id_value = keys['sec_value']
    if not val or id_type not in ('yuid', 'devid'):
        return

    last_cryptaid = None
    seen_direct = False
    for rec in recs:
        rec_cryptaid = rec.get('crypta_id')
        if rec_cryptaid:
            last_cryptaid = rec_cryptaid
        else:
            seen_direct = True

    yield {
        value: val,
        id_type: id_value,
        'crypta_id': last_cryptaid,
        'has_direct': seen_direct,
        '@table_index': 0 if id_type == 'yuid' else 1
    }


class CreateImportantPairsTablesAfterMatching(yt_luigi.BaseYtTask):
    '''
    Luigi task that builds the phone_email_puid dictionaries.

    It takes the known pairs between phonehash / emailhash / puid and
    yuid / devid, adds the pairs that can be derived through crypta_id
    from the matching tables, and writes the resulting tables to
    config.GRAPH_YT_DICTS_FOLDER + 'phone_email_puid/'.
    '''

    date = luigi.Parameter()
    exact_task = luigi.TaskParameter()
    fuzzy_task = luigi.TaskParameter()

    def input_folders(self):
        '''
        Named folders the task reads from.
        '''
        dicts_folder = config.GRAPH_YT_DICTS_FOLDER
        return {
            'dict': dicts_folder,
            'matching': dicts_folder + 'matching/',
            'devid_raw_month': config.INDEVICE_YT_FOLDER + self.date + '/perfect/devid_raw_month/'
        }

    def output_folders(self):
        '''
        Named folders the task writes to: a dated working folder and the
        final dictionary folder.
        '''
        return {
            'tmp_output': config.GRAPH_FOLDER + self.date + '/phone_email_puid/',
            'output': config.GRAPH_YT_DICTS_FOLDER + 'phone_email_puid/'
        }

    def requires(self):
        '''
        The matching result must be copied to the dictionaries first.
        '''
        vertices_in_dict = graph_vertices.CopyVerticesToDict(
            self.date,
            exact_task=self.exact_task,
            fuzzy_task=self.fuzzy_task
        )
        return [vertices_in_dict]

    def output(self):
        '''
        One dated target per output table: every pair between
        phonehash / puid / emailhash
        and
        yuid / devid / cryptaid
        '''
        folder = self.out_f('output')
        result = []
        for id_type in ('phonehash', 'puid', 'emailhash'):
            for target in ('yuid', 'devid', 'cryptaid'):
                table = folder + id_type + '_with_' + target
                result.append(yt_luigi.YtDateTarget(table, self.date))
        return result

    def run(self):
        '''
        Builds every table in the working folder, then copies the whole
        folder to the final location and removes the working folder.

        Steps (each one waits for the previous one):
          1. join puid-uuid with uuid-devid to get extra puid-devid pairs;
          2. create the output tables with strict schemas;
          3. one map-reduce over all inputs that splits the values into
             "with crypta_id" / "without crypta_id" tables (+ errors);
          4. sort the "with crypta_id" tables into *_with_cryptaid;
          5. expand every crypta_id into its yuids and devids;
          6. merge step 5 with the direct pairs into *_with_yuid and
             *_with_devid, then sort them;
          7. publish.
        '''
        # in_f / out_f come from the base class; presumably they look the
        # name up in input_folders() / output_folders() - TODO confirm
        dict_folder = self.in_f('dict')
        raw_month_folder = self.in_f('devid_raw_month')
        matching_folder = self.in_f('matching')
        tmp_folder = self.out_f('tmp_output')

        # ### INPUT TABLES ###
        # device side
        puid_uuid_table = raw_month_folder + 'puid_uuid_oauth'
        uuid_devid_table = dict_folder + 'uuid_dev_info_yt'
        puid_devid_table = raw_month_folder + 'puid_devid_oauth'  # puid
        email_phone_account_manager = dict_folder + 'account_manager'  # email phone

        # yuid side
        puid_yuid_table = dict_folder + 'puid_yuid_yt'
        email_yuid_table = dict_folder + 'yuid_with_id_email'
        phone_yuid_table = dict_folder + 'yuid_with_id_phone'

        # matching result
        cryptaid_with_all = matching_folder + 'exact_vertices_by_crypta_id'
        all_with_cryptaid = matching_folder + 'exact_vertices_by_key'

        # ### OUTPUT TABLES ###
        error_data_table = tmp_folder + 'tmp_cryptaid_matching_errors'

        id_types = ('phonehash', 'puid', 'emailhash')
        targets = ('yuid', 'devid')

        def wait_for(operations):
            # blocks until every already started asynchronous operation is done
            tracker = yt.OperationsTracker()
            for operation in operations:
                tracker.add(operation.op)
            tracker.wait_all()

        with yt.Transaction() as transaction,\
                yt.TempTable() as tmp_puid_devid_table,\
                yt.TempTable() as cryptaid_with_email,\
                yt.TempTable() as cryptaid_with_phone,\
                yt.TempTable() as cryptaid_with_puid,\
                yt.TempTable() as nocryptaid_with_puid,\
                yt.TempTable() as nocryptaid_with_email,\
                yt.TempTable() as nocryptaid_with_phone,\
                yt.TempTable() as tmp_preresult_puid,\
                yt.TempTable() as tmp_preresult_phone,\
                yt.TempTable() as tmp_preresult_email:

            mr.mkdir(tmp_folder)

            # step 1: puid-devid pairs through uuid (join of two tables)
            yt.run_map_reduce(
                puid_uuid_devid_mapper,
                puid_uuid_devid_reducer,
                [puid_uuid_table, uuid_devid_table],
                tmp_puid_devid_table,
                reduce_by=['uuid']
            )

            # step 2: output tables with schemas
            for id_type in id_types:
                for target in targets:
                    mr.create_table_with_schema(
                        tmp_folder + id_type + '_with_' + target,
                        {
                            id_type: 'string',
                            target: 'string',
                            'crypta_id': 'string',
                            'has_direct': 'boolean'
                        },
                        transaction,
                        strict=True,
                        recreate_if_exists=True,
                    )
                mr.create_table_with_schema(
                    tmp_folder + id_type + '_with_cryptaid',
                    {id_type: 'string', 'crypta_id': 'string'},
                    transaction,
                    strict=True,
                    recreate_if_exists=True,
                )

            # step 3: the position in these lists IS the table index that
            # the mapper reads and the reducer writes
            named_inputs = [
                ('account_manager', email_phone_account_manager),  # email/phone + devid
                ('yuid_with_id_email', email_yuid_table),
                ('yuid_with_id_phone', phone_yuid_table),
                ('puid_with_devid', puid_devid_table),  # normal devid-puid pairs
                ('puid_with_through_uuid', tmp_puid_devid_table),  # matched devid-puid pairs
                ('puid_with_yuid', puid_yuid_table),  # yuid-puid
                ('matching', all_with_cryptaid),  # matching
            ]
            named_outputs = [
                (('cryptaid', 'phonehash'), cryptaid_with_phone),
                (('cryptaid', 'emailhash'), cryptaid_with_email),
                (('nocryptaid', 'phonehash'), nocryptaid_with_phone),
                (('nocryptaid', 'emailhash'), nocryptaid_with_email),
                (('cryptaid', 'puid'), cryptaid_with_puid),
                (('nocryptaid', 'puid'), nocryptaid_with_puid),
                ('errors', error_data_table),
            ]
            in_tables = [pair[1] for pair in named_inputs]
            in_indexes = dict(
                (pair[0], position) for position, pair in enumerate(named_inputs)
            )
            out_tables = [pair[1] for pair in named_outputs]
            out_indexes = dict(
                (pair[0], position) for position, pair in enumerate(named_outputs)
            )

            # find crypta_id for phones, emails and puids
            yt.run_map_reduce(
                partial(prepare_input_tables, in_indexes=in_indexes),
                partial(get_cryptaid_for_value, out_indexes=out_indexes),
                in_tables,
                out_tables,
                reduce_by=['key', 'id_type', 'value_type'],
            )

            # (value field, pairs with crypta_id, direct pairs, expanded pairs)
            families = (
                ('phonehash', cryptaid_with_phone, nocryptaid_with_phone, tmp_preresult_phone),
                ('emailhash', cryptaid_with_email, nocryptaid_with_email, tmp_preresult_email),
                ('puid', cryptaid_with_puid, nocryptaid_with_puid, tmp_preresult_puid),
            )

            # step 4: sorted *_with_cryptaid tables for the output
            cryptaid_sorts = []
            for value_name, with_cryptaid, _direct, _expanded in families:
                cryptaid_sorts.append(
                    yt.run_sort(
                        with_cryptaid,
                        tmp_folder + value_name + '_with_cryptaid',
                        sort_by=[value_name, 'crypta_id'],
                        sync=False
                    )
                )
            wait_for(cryptaid_sorts)

            # step 5: yuids and devids for phones, emails and puids
            expansions = []
            for value_name, with_cryptaid, _direct, expanded in families:
                expansions.append(
                    yt.run_map_reduce(
                        None,
                        partial(get_vals_by_cryptaid, value=value_name),
                        [with_cryptaid, cryptaid_with_all],
                        expanded,
                        reduce_by=['crypta_id'],
                        sync=False
                    )
                )
            wait_for(expansions)

            # step 6: *_with_yuid and *_with_devid are the union of the pairs
            # found through crypta_id and the pairs we had from the start
            # (the latter have no crypta_id)
            merges = []
            for value_name, _with_cryptaid, direct, expanded in families:
                merges.append(
                    yt.run_map_reduce(
                        None,
                        partial(join_and_split_result_tables, value=value_name),
                        [expanded, direct],
                        [
                            tmp_folder + value_name + '_with_yuid',
                            tmp_folder + value_name + '_with_devid',
                        ],
                        reduce_by=[value_name, 'sec_value_type', 'sec_value'],
                        sync=False
                    )
                )
            wait_for(merges)

            # final in-place sort of the merged tables
            final_sorts = []
            for id_type in id_types:
                for target in targets:
                    final_sorts.append(
                        yt.run_sort(
                            tmp_folder + id_type + '_with_' + target,
                            sort_by=[id_type, target],
                            sync=False
                        )
                    )
            wait_for(final_sorts)

            # step 7: copy everything from the working folder to the real path
            output_folder = self.out_f('output')
            mr.mkdir(output_folder)
            # the folder path ends with '/', which is cut off for listing
            for node_name in yt.list(tmp_folder[:-1]):
                published = output_folder + node_name
                yt.copy(tmp_folder + node_name, published, force=True)
                mr.set_generate_date(published, self.date)
            mr.drop(tmp_folder)

if __name__ == '__main__':
    import os

    # table indexes must be passed with the rows: the map/reduce functions
    # of this module read and write '@table_index'
    yt.config["tabular_data_format"] = yt.YsonFormat(process_table_index=True)
    mr_server = os.getenv('RTCRYPTA_MR_SERVER')
    yt.config.set_proxy(mr_server)

    yt_trace.setup_trace()

    # NOTE(review): only the date is passed here, although the task also
    # declares exact_task and fuzzy_task parameters without defaults -
    # confirm that this manual run still works
    manual_run = CreateImportantPairsTablesAfterMatching('2017-07-24')
    luigi.build(
        [manual_run],
        workers=10,
        scheduler_port=int(config.LUIGID_PORT)
    )
