from collections import defaultdict
from collections import defaultdict
from functools import partial

import luigi
import yt.wrapper as yt

from data_imports.import_dumps.graph_people_search import ImportPeopleSearch
from lib.luigi import yt_luigi
from matching.yuid_matching import graph_merge_month
from rtcconf import config
from utils import mr_utils as mr
from utils import utils


def _join_social_data(social_id_key, recs, yuid_raw_indexes):
    social_id = social_id_key['id_value']
    social_type = social_id_key['id_type']
    if not (social_id and social_type):
        return

    yuid_raw_recs = []
    yuids = dict()
    emails = defaultdict(list)
    phones = defaultdict(list)
    ages = defaultdict(list)
    genders = defaultdict(list)
    people_search_found = False

    for r in recs:
        if 'yuid' in r and bool(r['id_value']):
            yuid_raw_recs.append(r)
            yuid = r['yuid']
            date = r['id_date']
            if yuid in yuids:
                last_date = yuids[yuid]
                if date > last_date:
                    yuids[yuid] = date
            else:
                yuids[yuid] = date

        profile_source = r.get('profile_source')
        if profile_source == 'dump':
            phone = r['phone']
            if phone:
                phones[phone].append(profile_source)
            birth_date = r['birth']
            if birth_date:
                ages[birth_date].append(profile_source)

        elif profile_source == 'people_search':
            people_search_found = True
            for phone in r['phones']:
                phones[phone].append(profile_source)
            for email in r['emails']:
                emails[email].append(profile_source)
            birth_date = r['birth_date']
            if birth_date:
                ages[birth_date].append(profile_source)
            gender = r['gender']
            if gender:
                genders[gender].append(profile_source)

    # matching
    for yuid_raw_rec in yuid_raw_recs:
        source_type = yuid_raw_rec['source_type']
        for phone, profile_sources in phones.iteritems():
            yuid_raw_rec['social_id'] = social_id
            yuid_raw_rec['id_value'] = phone
            yuid_raw_rec['id_type'] = config.ID_TYPE_PHONE
            yuid_raw_rec['source_type'] = '%s_%s' % (social_type, source_type)
            yuid_raw_rec['@table_index'] = yuid_raw_indexes[(source_type, config.ID_TYPE_PHONE)]
            yield yuid_raw_rec
        for email, profile_sources in emails.iteritems():
            yuid_raw_rec['social_id'] = social_id
            yuid_raw_rec['id_value'] = email
            yuid_raw_rec['id_type'] = config.ID_TYPE_EMAIL
            yuid_raw_rec['source_type'] = '%s_%s' % (social_type, source_type)
            yuid_raw_rec['@table_index'] = yuid_raw_indexes[(source_type, config.ID_TYPE_EMAIL)]
            yield yuid_raw_rec

    def get_best(socdem_dict):
        if not socdem_dict:
            return None

        both_sources = [x for x, sources in socdem_dict.iteritems() if len(sources) == 2]
        if both_sources:
            if len(both_sources) > 1:
                return None  # sources reported several ages or genders, don't believe it
            else:
                return both_sources[0]
        else:
            people_search = [x for x, sources in socdem_dict.iteritems() if 'people_search' in sources]
            dump = [x for x, sources in socdem_dict.iteritems() if 'dump' in sources]

            if dump and people_search and dump[0] == people_search[0]:
                return people_search[0]  # == dump[0]
            elif people_search:
                return people_search[0]
            elif dump:
                return dump[0]
            else:
                # two sources reported different socdem
                return None

    # socdem to profiles
    best_sex = get_best(genders)
    best_age = get_best(ages)

    for yuid, last_date in yuids.iteritems():
        if best_sex or best_age:
            values = ['type=%s' % social_type.upper() + '_PS']  # for people_search
            if best_age:
                values.append('yob=%s' % best_age)
            if best_sex:
                sex = 1 if best_sex == 'm' else 2
                values.append('sex=%s' % sex)

            last_ts = utils.date_str_to_ts(last_date)
            yield {'key': yuid, 'subkey': str(last_ts), 'value': '\t'.join(values),
                   '@table_index': len(yuid_raw_indexes)}
        if people_search_found:
            yield {'social_id': social_id, 'social_type': social_type,
                   'yuid': yuid, 'last_date': last_date, '@table_index': len(yuid_raw_indexes) + 1}
        else:
            yield {'social_id': social_id, 'social_type': social_type,
                   'yuid': yuid, 'last_date': last_date, '@table_index': len(yuid_raw_indexes) + 2}



class EnrichSocialIdsWithPeopleSearch(yt_luigi.BaseYtTask):
    """Join yuid/social-id tables with profile dump and people search tables.

    For VK the yuid tables are reduced together with the ``profiles`` dump
    and ``people_search/vk``; for OK only with ``people_search/ok``. Both
    reduces use ``_join_social_data`` and write, under ``yuid_raw/social/``,
    ``<id_type>_people_search``, ``<id_type>_people_search_found`` and
    ``<id_type>_people_search_not_found``. The VK reduce additionally writes
    one ``yuid_with_<phone>_<vk>_<source_type>`` table per VK source type.
    """
    date = luigi.Parameter()

    def input_folders(self):
        return {
            'yuid_raw': config.GRAPH_YT_DICTS_FOLDER + 'yuid_raw/',
            'dict': config.GRAPH_YT_DICTS_FOLDER
        }

    def output_folders(self):
        return {
            'yuid_raw': config.GRAPH_YT_DICTS_FOLDER + 'yuid_raw/',
        }

    def requires(self):
        tasks = [graph_merge_month.FullMonthYuidMergeTask(self.date),
                 graph_merge_month.IncrementalDayAndDumpMergeTask(self.date)]
        if config.HAS_PEOPLE_SEARCH == 'yes':
            tasks.append(ImportPeopleSearch(self.date))
        # otherwise this dict is copied directly to dicts
        return tasks

    def __init__(self, *args, **kwargs):
        self.vk_type = config.YUID_PAIR_TYPES_DICT[config.ID_TYPE_VKCOM]
        self.ok_type = config.YUID_PAIR_TYPES_DICT[config.ID_TYPE_OKRU]
        super(EnrichSocialIdsWithPeopleSearch, self).__init__(*args, **kwargs)

    def _vk_phone_out_tables(self):
        """Return the VK phone expansion tables, one per VK source type, in source-type order."""
        out_dir = self.out_f('yuid_raw')
        return [out_dir + 'yuid_with_%s_%s_%s' % (config.ID_TYPE_PHONE, config.ID_SOURCE_TYPE_VK, s)
                for s in self.vk_type.source_types]

    def _social_tables(self, id_type):
        """Return the (socdem, found, not_found) table paths for a social id type."""
        prefix = self.out_f('yuid_raw') + 'social/' + id_type
        return (prefix + '_people_search',
                prefix + '_people_search_found',
                prefix + '_people_search_not_found')

    def _all_out_tables(self):
        """Return every table this task produces; shared by run() and output()."""
        vk_socdem, vk_found, vk_not_found = self._social_tables(config.ID_TYPE_VKCOM)
        ok_socdem, ok_found, ok_not_found = self._social_tables(config.ID_TYPE_OKRU)
        return self._vk_phone_out_tables() + [vk_socdem, ok_socdem,
                                              vk_found, vk_not_found,
                                              ok_found, ok_not_found]

    @staticmethod
    def _run_join(in_tables, out_tables, out_indexes):
        """Run the ``_join_social_data`` reduce over ``in_tables`` into ``out_tables``."""
        yt.run_reduce(partial(_join_social_data, yuid_raw_indexes=out_indexes),
                      in_tables,
                      out_tables,
                      reduce_by=['id_value', 'id_type'],
                      # 2.5 GiB; floor division keeps the value an integer
                      memory_limit=5 * 1024 * 1024 * 1024 // 2,
                      # NOTE(review): this is a reduce operation, yet the spec section
                      # is named "mapper" - confirm which section was intended.
                      spec={"mapper": {"memory_reserve_factor": 1}}
                      )

    def run(self):
        dump_vk = self.in_f('dict') + 'profiles'
        vk_people_search = self.in_f('dict') + 'people_search/vk'
        ok_people_search = self.in_f('dict') + 'people_search/ok'

        mr.mkdir(self.out_f('yuid_raw'))
        mr.mkdir(self.out_f('yuid_raw') + 'social/')

        # VK
        # we only can expand VK with phones for now
        vk_in_tables = [self.in_f('yuid_raw') + 'yuid_with_%s_%s' % (config.ID_TYPE_VKCOM, s)
                        for s in self.vk_type.source_types]
        # the index of a source type equals the position of its table in vk_out_tables
        vk_out_indexes = dict(((s, config.ID_TYPE_PHONE), idx)
                              for idx, s in enumerate(self.vk_type.source_types))
        vk_out_tables = self._vk_phone_out_tables()
        vk_socdem_table, vk_people_search_found, vk_people_search_not_found = \
            self._social_tables(config.ID_TYPE_VKCOM)

        mr.sort_all([dump_vk, vk_people_search] + vk_in_tables,
                    sort_by=['id_value', 'id_type'])

        # the reducer addresses the three trailing tables as
        # len(indexes), len(indexes) + 1 and len(indexes) + 2
        self._run_join(vk_in_tables + [vk_people_search, dump_vk],
                       vk_out_tables + [vk_socdem_table, vk_people_search_found,
                                        vk_people_search_not_found],
                       vk_out_indexes)

        # OK
        # there is no data for phone and email expansion in OK
        ok_in_tables = [self.in_f('yuid_raw') + 'yuid_with_%s_%s' % (config.ID_TYPE_OKRU, s)
                        for s in self.ok_type.source_types]
        ok_socdem_table, ok_people_search_found, ok_people_search_not_found = \
            self._social_tables(config.ID_TYPE_OKRU)

        mr.sort_all([ok_people_search] + ok_in_tables, sort_by=['id_value', 'id_type'])

        self._run_join(ok_in_tables + [ok_people_search],
                       [ok_socdem_table, ok_people_search_found, ok_people_search_not_found],
                       dict())

        utils.wait_all([
            yt.run_sort(vk_socdem_table, sort_by='key', sync=False),
            yt.run_sort(ok_socdem_table, sort_by='key', sync=False),
            yt.run_sort(vk_people_search_found, sort_by='social_id', sync=False),
            yt.run_sort(vk_people_search_not_found, sort_by='social_id', sync=False),
            yt.run_sort(ok_people_search_found, sort_by='social_id', sync=False),
            yt.run_sort(ok_people_search_not_found, sort_by='social_id', sync=False),
        ])

        for t in self._all_out_tables():
            mr.set_generate_date(t, self.date)

    def output(self):
        return [yt_luigi.YtDateTarget(t, self.date) for t in self._all_out_tables()]
