#!/usr/bin/env python

import hashlib
import os

import luigi
import yt.wrapper as yt

import hh_util as util
import main_profile_config
from enrich import HHEnrichTask
from hh_info import HouseholdInfo
from lib.luigi import yt_luigi
from rtcconf import config
from utils import mr_utils as mr
from v2.soup import soup_config
from v2.soup.soup_tables import SoupDumpTable


def map_hh(rec):
    """Mapper: re-key a household membership record by its stringified id.

    Emits exactly one record carrying ``key`` (``str(rec['id'])``),
    ``id_type`` and ``hhid`` copied from the input record.
    """
    member = dict(
        key=str(rec['id']),
        id_type=rec['id_type'],
        hhid=rec['hhid'],
    )
    yield member


def map_hh_geo(rec):
    """Mapper: re-key a geo record by its stringified id.

    Emits exactly one record with ``key`` (``str(rec['id'])``) plus the
    ``lat`` and ``lon`` values copied from the input record.
    """
    geo = {}
    geo['key'] = str(rec['id'])
    geo['lat'] = rec['lat']
    geo['lon'] = rec['lon']
    yield geo


# Keys of the six-bucket age distribution, in the order they are serialized.
six_age_keys = ['0_17', '18_24', '25_34', '35_44', '45_54', '55_99']


def map_profiles(rec):
    """Mapper: extract age scores and segment ids from a profile record.

    Records without a truthy ``yandexuid`` produce no output. Otherwise one
    record is emitted with:
      * ``key``           - the stringified yandexuid;
      * ``6s_age``        - comma-joined ``user_age_6s`` values ordered as
                            ``six_age_keys`` (only when ``user_age_6s`` is truthy);
      * ``prob_segments`` - the keys of ``probabilistic_segments``
                            (only when that field is truthy).
    """
    yandexuid = rec.get('yandexuid', 0)
    if not yandexuid:
        return

    profile = {'key': str(yandexuid)}

    age_scores = rec.get('user_age_6s', None)
    if age_scores:
        profile['6s_age'] = ','.join(str(age_scores[bucket]) for bucket in six_age_keys)

    segments = rec.get('probabilistic_segments', None)
    if segments:
        profile['prob_segments'] = segments.keys()

    # TODO: use gender and income from crypta-yandexuid-profiles (currently profiles from yuid_ua dict are used)
    yield profile


def reduce_id_hh_info(key, recs):
    """Reducer: fold every record sharing one ``key`` into a single record.

    Merging rules, driven by each record's ``@table_index``:
      * table 0 records are always merged in as-is;
      * table 1 and 2 records are merged as-is, but only once something
        has already been collected;
      * records of any other table contribute the fields obtained from
        ``util.parse_tskv(rec['value'])``, again only once something has
        already been collected.

    When anything was collected, one record is emitted with the ``key``
    field removed, ``id`` set to ``key['key']`` and ``@table_index`` set to 0.
    """
    merged = {}
    for rec in recs:
        table_index = rec['@table_index']
        if table_index != 0 and not merged:
            # Nothing from table 0 seen yet: auxiliary rows are dropped.
            continue
        if table_index in (0, 1, 2):
            merged.update(rec)
        else:
            merged.update(util.parse_tskv(rec['value']))

    if not merged:
        return

    del merged['key']
    merged['id'] = key['key']
    merged['@table_index'] = 0
    yield merged


# Household flag name -> set of string ids that imply that flag.
# NOTE(review): the ids look like adhoc / probabilistic segment ids - confirm.
flags_to_adhocs = {
    'child': set([
        '115',  # married with children
        '92',   # women with children
        '112',  # women with children 0-3
        '189',  # women with children 0-1 + pregnant
        '129',  # women with children 11-16
        '113',  # women with children 3-6
        '97',   # women with children 6-11
    ]),
    'income_C': set([
        '89',   # top managers
        '199',  # wealthy
    ]),
}


def hash_tns(yuid):
    """Return the hex MD5 digest of ``yuid`` followed by a fixed suffix string.

    NOTE(review): a bytes template is formatted with a ``str`` literal, which
    relies on Python 2 string semantics - confirm before running on Python 3.
    Within the visible part of this file the function is referenced only from
    commented-out code in ``reduce_hh_composition``.
    """
    return hashlib.md5(b'%s%s' % (yuid, 'e0440ebc0786e3d2cff6ef51319bc226')).hexdigest()


def to_float(socdem):
    """Parse a comma-separated string of numbers into a list of floats.

    Empty chunks (e.g. from an empty string or doubled commas) are skipped.
    """
    parsed = []
    for chunk in socdem.split(','):
        if chunk:
            parsed.append(float(chunk))
    return parsed


def above_tr(socdem, tr):
    """Return the stringified positions whose value reaches its threshold.

    ``socdem`` and ``tr`` are walked in lockstep (the shorter one bounds the
    walk); position ``i`` is included when ``socdem[i] >= tr[i]``.
    """
    passed = set()
    for position, (value, threshold) in enumerate(zip(socdem, tr)):
        if value >= threshold:
            passed.add(str(position))
    return passed


def reduce_hh_composition(key, recs):
    """Reducer: build one household profile from all member records of a household.

    ``key['hhid']`` is the household id; ``recs`` are the member records, each
    with ``id_type`` ('y' for yandexuid members, 'd' for device members) and ``id``.

    Emitted records, by ``@table_index``:
      0 - one row per yandexuid carrying the base64 household profile;
      1 - one row per device id carrying the base64 household profile;
      2 - (main profile yuid, smart-tv yuid) pair rows;
      3 - soup records for the same pairs, built by ``SoupDumpTable.make_rec``
          (presumably its last argument is the table index - TODO confirm);
      4 - the human readable household profile (sets converted to lists).
    """
    # TODO: use correct thresholds (?)
    # Positions follow six_age_keys: the last bucket is '55_99'.
    age_tr = [0.3, 0.25, 0.36, 0.32, 0.27, 0.34]    # 0 - <18, 1 - 18-24, 2 - 25-34, 3 - 35-44, 4 - 45-54, 5 - >=55
    sex_tr = [0.58, 0.65]                           # 0 - male, 1 - female
    # Thresholds used when picking the main profile (currently equal to the ones above).
    main_profile_age_tr = [0.3, 0.25, 0.36, 0.32, 0.27, 0.34]
    main_profile_sex_tr = [0.58, 0.65]

    hh_data = dict()
    hh_data['brs'] = set()      # browsers seen on yandexuid members
    hh_data['oss'] = set()      # operating systems seen on any member
    hh_data['pltfrms'] = set()  # first char of ua_profile for yuids, 'a' for devices

    ys = set()  # yandexuid members
    ds = set()  # device id members
    hh_age = set()
    hh_sex = set()
    hh_inc = [0, 0, 0]  # element-wise sum of the members' income scores
    socdem_profiles = []    # format : [({sex idx, ...}, {age idx, ...}, [adhoc, ...], yuid), ...]
    smart_tv_yuids = []

    for rec in recs:
        id_type = rec['id_type']
        id = rec['id']
        if id_type == 'y':
            ys.add(id)

            yuid_sex = to_float(rec.get('sex', ''))
            yuid_age = to_float(rec.get('6s_age', ''))
            yuid_inc = to_float(rec.get('income', ''))

            if yuid_inc:
                hh_inc = [x + y for x, y in zip(yuid_inc, hh_inc)]

            yuid_age_segments = above_tr(yuid_age, age_tr)
            hh_age |= yuid_age_segments

            # The check 18-45 for sex is requested here:
            # https://wiki.yandex-team.ru/users/sirina0/notes/advideoproduct/advideoproductdetails/HouseHoldProduct#h-1
            # (sex is only counted for yuids that hit neither the first nor the last age bucket)
            if '0' not in yuid_age_segments and '5' not in yuid_age_segments:
                hh_sex |= above_tr(yuid_sex, sex_tr)

            adhocs = rec.get('prob_segments', [])

            socdem_profiles += [(above_tr(yuid_sex, main_profile_sex_tr),
                                 above_tr(yuid_age, main_profile_age_tr),
                                 adhocs,
                                 id)]


            # Browser / OS names are recorded only when a version is present too.
            if rec.get('br') and rec.get('br_v'):
                hh_data['brs'].add(rec['br'])

            if rec.get('os') and rec.get('os_v'):
                hh_data['oss'].add(rec['os'])

            if rec.get('ua_profile'):
                hh_data['pltfrms'].add(rec['ua_profile'][0])
                # Profiles starting with 'd|tv' are collected as smart-tv yuids.
                if rec['ua_profile'].startswith('d|tv'):
                    smart_tv_yuids.append(id)

            # The last member having both coordinates wins.
            if rec.get('lat') and rec.get('lon'):
                hh_data['lat'] = rec['lat']
                hh_data['lon'] = rec['lon']

        elif id_type == 'd':
            ds.add(id)

            if rec.get('os'):
                hh_data['oss'].add(rec['os'])

            hh_data['pltfrms'].add('a')

    household_id = key['hhid']

    #  --- find main profile
    # get correct priority list using hash of hh_id
    map_index = (int(household_id)*19*31) % 11719     # simple hash; result is in [0, 11719)
    map_index = map_index % main_profile_config.mapping_index_size
    priorities, sex_priority = None, None
    # Pick the first config entry whose upper bound covers map_index.
    # NOTE(review): if no entry matches, both values stay None and the code
    # below would fail - assumes the config covers the whole index range.
    for i, pr, sex_pr in main_profile_config.socdem_priorities:
        if map_index <= i:
            priorities = pr
            sex_priority = sex_pr
            break

    # find yuids with max priority
    # (a lower index in `priorities` means a higher priority)
    adhocs_canditats = []
    max_priority = None
    for sex, age, adhocs, yuid in socdem_profiles:
        for i, pr in enumerate(priorities):
            if str(pr.sex) in sex and str(pr.age) in age:
                if max_priority is None or i < max_priority:
                    max_priority = i
                    adhocs_canditats = [(adhocs, yuid)]
                elif i == max_priority:
                    adhocs_canditats += [(adhocs, yuid)]

    all_adhocs = []
    main_profile_yuid = None
    main_profile_sex = ""
    main_profile_age = ""
    if max_priority is not None:
        # has yuid with main profile
        main_profile_sex, main_profile_age = str(priorities[max_priority].sex), str(priorities[max_priority].age)
        # Among the best-priority candidates the one with the most adhocs wins.
        all_adhocs, main_profile_yuid = max(adhocs_canditats, key=lambda x: len(x[0]))
    else:
        # no main yuid, so no adhocs, only set main socdem flags
        if str(sex_priority) in hh_sex:
            main_profile_sex = str(sex_priority)
        elif str(1 - sex_priority) in hh_sex:
            main_profile_sex = str(1 - sex_priority)
        # NOTE(review): this branch tests hh_sex (not hh_age) before falling
        # back to the age buckets - confirm this is intended.
        if '0' in hh_sex or '1' in hh_sex:
            main_profile_age = '2'
        elif '5' in hh_age:
            main_profile_age = '5'
        elif '0' in hh_age:
            main_profile_age = '0'

    all_adhocs = set(all_adhocs)

    # CRYPTAIS-226
    # Household income is the index of the largest summed score ('' when all sums are 0).
    max_inc = ''
    if any([x != 0 for x in hh_inc]):
        max_inc = str(hh_inc.index(max(hh_inc)))

    hh_data['sex'] = hh_sex
    hh_data['age'] = hh_age
    hh_data['inc'] = max_inc
    hh_data['yuids'] = ys
    hh_data['devids'] = ds
    hh_data['yc'] = len(ys) # number of yandexuid members
    hh_data['dc'] = len(ds) # number of device members
    # Blank adhoc ids are dropped; the rest are ordered numerically.
    hh_data['adhocs'] = sorted(list(a for a in all_adhocs if a.strip()), key=lambda i: int(i))
    hh_data['hh_id'] = household_id
    hh_data['mp_sex'] = set(main_profile_sex) if main_profile_sex else set()
    hh_data['mp_age'] = set(main_profile_age) if main_profile_age else set()

    hh_info = HouseholdInfo(hh_data)
    hh_info_b64_binary = hh_info.base64_binary()

    # table with human readable HH profile
    hh_data['binary_profile'] = hh_info_b64_binary
    hh_data['@table_index'] = 4
    # Sets are converted to lists before the record is emitted.
    for k in hh_data:
        if isinstance(hh_data[k], set):
            hh_data[k] = list(hh_data[k])
    yield hh_data

    # One row per yandexuid member carrying the encoded household profile.
    for y in ys:
        yield {
            'key': y,
            'subkey': '',
            'value': 'keyword=353\tyuid=%s\tvalue=%s' % (y, hh_info_b64_binary),
            '@table_index': 0
        }
#        yield {'key': key['hhid'], 'subkey': hh_info.tns(), 'value': hash_tns(y), '@table_index': 5}

    # One row per device member carrying the encoded household profile.
    for d in ds:
        yield {
            'key': d,
            'subkey': '',
            'value': 'keyword=353\tdeviceid=%s\tvalue=%s' % (d, hh_info_b64_binary),
            '@table_index': 1
        }

    # for smart-tv - main yandexuid pairs (see CRYPTAUP-754)
    if main_profile_yuid:
        for yuid in smart_tv_yuids:
            if yuid != main_profile_yuid:
                yield {
                    'id_type': 'hh_smart_tv',
                    'id_value': '',
                    'key': main_profile_yuid + '_' + yuid,
                    'pair_source': 'hh_smart_tv',
                    'pair_type': 'y_y',
                    'yuid1_sources': ['hh_smart_tv'],
                    'yuid2_sources': ['hh_smart_tv'],
                    '@table_index': 2
                }
                yield SoupDumpTable.make_rec(
                    main_profile_yuid,
                    yuid,
                    soup_config.yuid_yuid_smart_tv_hh,
                    [],
                    3
                )


# [ deviceid | "cryptaid" | cid ]
# [ deviceid | "" | to_bb_string ]
#  -> [ cid | "" | to_bb_string(with substituted cid) ]
def reduce_substitute_cid(key, recs):
    """Reducer: re-key per-device payload rows by the device's crypta id.

    Rows with ``subkey == "cryptaid"`` supply the crypta id in ``value``
    (the last such row wins); the ``value`` of every other row is a
    tab-separated payload. If a crypta id was found, each payload is emitted
    under ``key = cid`` with its second tab-separated field replaced by
    ``cid=<cid>``. Without a crypta id nothing is emitted.
    """
    crypta_id = ""
    payloads = []
    for rec in recs:
        if rec['subkey'] != "cryptaid":
            payloads.append(rec['value'])
        else:
            crypta_id = rec['value']

    if crypta_id == "":
        return

    for payload in payloads:
        # Split into at most three pieces so the tail is preserved verbatim.
        fields = payload.split('\t', 2)
        fields[1] = "cid=%s" % crypta_id
        yield dict(key=crypta_id, subkey='', value='\t'.join(fields))


def uniq_reduce(key, recs):
    """Reducer: keep only the first record of each key group.

    Iterates instead of calling ``next(recs)`` so that an empty group simply
    yields nothing: a ``StopIteration`` escaping from ``next()`` inside a
    generator is turned into ``RuntimeError`` under PEP 479. Any iterable is
    accepted, not only iterators.
    """
    for rec in recs:
        yield rec
        # Only the first record is wanted; the rest of the group is ignored.
        return


def make_diff(key, recs):
    """Reducer: emit a key's value from table 1 when it is new or changed.

    ``recs`` come from two tables distinguished by ``@table_index``; for each
    index the last seen ``value`` is kept. A row is emitted only when table 1
    has a value and table 0 either has none or has a different one.
    """
    value_by_table = {}
    for rec in recs:
        table_index = rec['@table_index']
        value_by_table[table_index] = rec['value']

    if 1 not in value_by_table:
        return

    current = value_by_table[1]
    if 0 in value_by_table and value_by_table[0] == current:
        # Unchanged since the previous state: nothing to send.
        return

    yield dict(key=key['key'], subkey='', value=current)


class HHCompositionTask(yt_luigi.BaseYtTask):
    """
    Collect HH profile (socdem, adhocs, geo, etc.)
    Generate stable uint32 HH id
    Prepare data for sending to BB

    Depends on HHEnrichTask for the same date. Writes into the 'hh' output
    folder: 'households_to_bb', 'households_to_bb_diff',
    'smart_tv_main_yuid_pairs' and 'hh_profile_data', plus the smart-tv soup
    table.
    """
    date = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        super(HHCompositionTask, self).__init__(*args, **kwargs)
        # Soup table receiving the smart-tv yuid pairs produced by run().
        self.smart_tv_soup_table = SoupDumpTable(soup_config.yuid_yuid_smart_tv_hh, self.date)

    def input_folders(self):
        """Return the named input folders: household data and graph dicts."""
        return {
            'hh': config.HH_FOLDER2,
            'dict': config.GRAPH_YT_DICTS_FOLDER,
        }

    def output_folders(self):
        """Return the named output folders (same household folder as the input)."""
        return {
            'hh': config.HH_FOLDER2,
        }

    def requires(self):
        """Return the upstream tasks: household enrichment for the same date."""
        return [HHEnrichTask(date=self.date)]

    def run(self):
        """Run the whole pipeline of YT operations that builds the outputs."""
        tmp = os.path.join(self.out_f('hh'), 'tmp') + '/'
        mr.mkdir(tmp)
        devid_cid_table = os.path.join(config.IS_OUTPUT_FOLDER, 'mapping_deviceid_cid')

        with yt.TempTable() as tmp_crypta_profiles, \
                yt.TempTable() as tmp_hh_geo, \
                yt.TempTable() as tmp_hh_members, \
                yt.TempTable() as tmp_hh_members_with_all_info, \
                yt.TempTable() as tmp_to_bb_cid, \
                yt.TempTable() as tmp_to_bb_cid_uniq, \
                yt.TempTable() as tmp_to_bb_deviceid, \
                yt.TempTable() as tmp_to_bb_yuid, \
                yt.TempTable() as tmp_devid_cid:

            # Step 1: bring members, geo and profiles to a common 'key' column.
            yt.run_map(map_hh, os.path.join(self.in_f('hh'), 'enriched_hh_reversed'), tmp_hh_members)

            yt.run_map(map_hh_geo, os.path.join(self.in_f('hh'), 'hh_geo'), tmp_hh_geo)

            # The profiles table whose path sorts last is used.
            last_profiles_table = sorted(yt.list(config.CRYPTA_PROFILES_LOG_DIR, absolute=True))[-1]
            yt.run_map(map_profiles,
                       yt.TablePath(last_profiles_table, columns=['yandexuid', 'user_age_6s', 'probabilistic_segments']),
                       tmp_crypta_profiles)

            # Step 2: join everything known about each id.
            # reduce_id_hh_info dispatches on '@table_index'; presumably the
            # index follows the order of the input list below - TODO confirm.
            mr.sort_all([tmp_hh_members, tmp_hh_geo, tmp_crypta_profiles], sort_by='key')
            yt.run_reduce(reduce_id_hh_info,
                          [
                              tmp_hh_members,
                              tmp_hh_geo,
                              tmp_crypta_profiles,
                              os.path.join(self.in_f('dict'), 'yuid_ua'),
                              os.path.join(self.in_f('dict'), 'dev_info'),
                          ],
                          tmp_hh_members_with_all_info,
                          reduce_by='key')

            # Step 3: group members by household and build the profiles.
            yt.run_sort(tmp_hh_members_with_all_info, sort_by='hhid')

            with yt.Transaction() as tr:

                # reduce_hh_composition tags its records with '@table_index'
                # 0..4; the output list below is given in that same order.
                yt.run_reduce(reduce_hh_composition,
                              tmp_hh_members_with_all_info,
                              [
                                  tmp_to_bb_yuid,
                                  tmp_to_bb_deviceid,
                                  self.out_f('hh')+'smart_tv_main_yuid_pairs',
                                  self.smart_tv_soup_table.create(tr),
                                  self.out_f('hh')+'hh_profile_data'
                              ],
                              reduce_by='hhid')

                self.smart_tv_soup_table.finalize(tr)

            # Step 4: replace device ids with crypta ids in the device rows.
            yt.run_sort(tmp_to_bb_deviceid, sort_by='key')
            yt.run_sort(devid_cid_table, tmp_devid_cid, sort_by='key')
            yt.run_reduce(reduce_substitute_cid,
                          [tmp_to_bb_deviceid, tmp_devid_cid],
                          tmp_to_bb_cid,
                          reduce_by='key')

            # Keep a single row per crypta id.
            yt.run_sort(tmp_to_bb_cid, sort_by='key')
            yt.run_reduce(uniq_reduce, tmp_to_bb_cid, tmp_to_bb_cid_uniq, reduce_by='key')

            # Step 5: final table = yandexuid rows + crypta id rows.
            yt.run_merge([tmp_to_bb_yuid, tmp_to_bb_cid_uniq],
                         self.out_f('hh')+'households_to_bb',
                         spec={'combine_chunks': True})

            # Step 6: diff against the previous state when one exists;
            # otherwise the full table is copied as the diff.
            if yt.exists(self.out_f('hh')+'households_to_bb_old'):
                mr.sort_all([self.out_f('hh')+'households_to_bb', self.out_f('hh')+'households_to_bb_old'], sort_by='key')
                yt.run_reduce(make_diff,
                              [self.out_f('hh')+'households_to_bb_old', self.out_f('hh')+'households_to_bb'],
                              self.out_f('hh')+'households_to_bb_diff',
                              reduce_by=['key'])
            else:
                yt.copy(self.out_f('hh')+'households_to_bb', self.out_f('hh')+'households_to_bb_diff', force=True)

        # Stamp the outputs with the task date (output() checks these tables by date).
        mr.set_generate_date(self.out_f('hh') + 'households_to_bb', self.date)
        mr.set_generate_date(self.out_f('hh') + 'households_to_bb_diff', self.date)
        mr.set_generate_date(self.out_f('hh') + 'smart_tv_main_yuid_pairs', self.date)

    def output(self):
        """Return the targets of this task: the soup table and three dated tables."""
        soup_out_tables = [self.smart_tv_soup_table.as_target()]
        return soup_out_tables + [yt_luigi.YtDateTarget(self.out_f('hh') + 'households_to_bb', self.date),
                                  yt_luigi.YtDateTarget(self.out_f('hh') + 'households_to_bb_diff', self.date),
                                  yt_luigi.YtDateTarget(self.out_f('hh') + 'smart_tv_main_yuid_pairs', self.date)]


if __name__ == '__main__':
    # Manual entry point: runs the task directly via task.run() for the date
    # given as the first command line argument, with config paths overridden
    # below (the household folder points at a test location).
    import sys

    dt = sys.argv[1]

    yt.config.set_proxy(config.MR_SERVER)
    # Reducers in this module dispatch on '@table_index'.
    yt.config["tabular_data_format"] = yt.YsonFormat(process_table_index=True)

    config.HH_FOLDER2 = '//home/crypta/team/shiryaev/test_hh/'
    config.STORE_DAYS = 2
    config.CRYPTA_PROFILES_LOG_DIR = '//statbox/crypta-yandexuid-profiles-log'
    config.IS_OUTPUT_FOLDER = '//home/crypta/production/state/iscrypta/'

    task = HHCompositionTask(dt)

    # Single-argument call form prints the same text on Python 2 and keeps
    # the module parseable on Python 3 (the print statement is not).
    print('Starting HH composition...')

    task.run()

    print('Done.')
