from collections import Counter
from collections import defaultdict

import yt.wrapper as yt

from rtcconf import config
from utils import mr_utils as mr
from utils import utils, geo_utils


def map_yuid_ips(rec):
    """Explode the FP-source ip activity of one yuid row into a row per ip.

    Reads the column named
    ``config.ID_TYPE_IP + '_' + config.ID_SOURCE_TYPE_FP + '_dates'``
    (ip -> (date -> count)) and yields ``{'yuid', 'ip', 'date_count'}`` for
    every ip in it. Rows where that column is missing or empty yield nothing.

    Access-log ip activity used to be summed into the same per-ip, per-date
    counters; that merge is disabled by CRYPTAIS-1294, so only the FP-source
    column is read here.
    """
    ip_dates = rec.get(config.ID_TYPE_IP + '_' + config.ID_SOURCE_TYPE_FP + '_dates')
    if not ip_dates:
        return

    yuid = rec['yuid']
    # ip -> (date -> count); items() (not iteritems()) matches map_yuid_regs
    # and works on both Python 2 and Python 3
    for ip, date_dict in ip_dates.items():
        yield {'yuid': yuid, 'ip': ip, 'date_count': date_dict}


def reduce_uniq_ip(key, recs):
    """Collapse every group of rows sharing an ip into one ``{'ip': ...}`` row.

    ``recs`` is deliberately not read: the reduce key alone carries the ip.
    """
    unique_row = dict(ip=key['ip'])
    yield unique_row


def map_ip_region(rec):
    """Resolve ``rec['ip']`` to a region and yield the row with region ids added.

    Adds ``region_id``, ``country_id``, ``oblast_id`` and ``city_id`` to the
    row. The row is dropped when ``geo_utils.geodata_region`` returns a falsy
    value, or when a ``RuntimeError`` is raised during the lookup.
    """
    try:
        found = geo_utils.geodata_region(rec['ip'])
        if not found:
            return

        found_id = found.id
        geo_lookup = geo_utils.get_geo_lookup(use_local_bin_file=False)
        # the fourth value of get_region_info is not stored on the row
        country, oblast, city, _main_region = geo_utils.get_region_info(found_id, geo_lookup)
        rec.update({
            'region_id': found_id,
            'country_id': country,
            'oblast_id': oblast,
            'city_id': city,
        })
        yield rec
    except RuntimeError:
        # geobase can't parse it
        return


def reduce_yuid_ips_region(key, recs):
    """Copy the region ids of an ip onto every yuid row with that ip.

    Rows with ``'@table_index' == 0`` supply ``region_id``, ``country_id``,
    ``oblast_id`` and ``city_id``. Rows with ``'@table_index' == 1`` get those
    values added, have ``'@table_index'`` reset to 0 and are yielded.

    Processing of the group stops at the first row that is neither of the
    above -- in particular a table-1 row seen while no region is known.
    """
    region_columns = ('region_id', 'country_id', 'oblast_id', 'city_id')
    known_region = dict.fromkeys(region_columns)

    for row in recs:
        source = row['@table_index']
        if source == 0:
            known_region = dict((column, row[column]) for column in region_columns)
            continue

        if source != 1 or known_region['region_id'] is None:
            # nothing to join with (or an unexpected input table): give up on this ip
            return

        row.update(known_region)
        row['@table_index'] = 0
        yield row




def reduce_yuid_regs(key, recs):
    """Merge per-ip region activity of a yuid into its yuid_with_all row.

    Rows with ``'@table_index' == 0`` are the yuid_with_all row (the last one
    seen is used); rows with ``'@table_index' == 1`` carry ``region_id``,
    ``country_id``, ``oblast_id``, ``city_id`` and ``date_count``
    (date -> count). Rows with any other table index are ignored.

    When at least one table-1 row was seen, the yielded row gets:
      * ``reg_fp_dates`` and ``config.ID_TYPE_REGION + '_dates'``:
        region_id (as str) -> (date -> summed count);
      * ``main_region_country`` / ``main_region_obl`` / ``main_region_city`` /
        ``main_region``: ``utils.top`` of counters weighted by the number of
        distinct dates in each contributing row.

    Exactly one row is yielded per group; it is an empty dict extended as
    above when the group has no table-0 row.
    """
    base_row = {}
    dates_by_region = defaultdict(Counter)

    country_days = Counter()
    oblast_days = Counter()
    city_days = Counter()
    main_region_days = Counter()
    level_counters = (
        ('country_id', country_days),
        ('oblast_id', oblast_days),
        ('city_id', city_days),
    )

    for row in recs:
        source = row['@table_index']
        if source == 0:
            # yuid_with_all table
            base_row = row
            continue
        if source != 1:
            continue

        date_hits = row['date_count']
        dates_by_region[str(row['region_id'])] += Counter(date_hits)

        # TODO: rework using split_by_region_level

        active_days = len(date_hits)
        for column, counter in level_counters:
            level_id = row[column]
            if level_id:
                counter[level_id] += active_days

        # oblast takes precedence over city for the "main" region
        primary_id = row['oblast_id'] or row['city_id']
        if primary_id:
            main_region_days[primary_id] += active_days

    if dates_by_region:
        # region_id -> (date -> count)
        base_row['reg_fp_dates'] = dict(dates_by_region)  # compatibility
        base_row[config.ID_TYPE_REGION + '_dates'] = dict(dates_by_region)
        for column, counter in (
            ('main_region_country', country_days),
            ('main_region_obl', oblast_days),
            ('main_region_city', city_days),
            ('main_region', main_region_days),
        ):
            base_row[column] = utils.top(counter)

    yield base_row


def map_yuid_regs(rec):
    """Serialize the region activity of a yuid into a key/subkey/value row.

    Reads the ``config.ID_TYPE_REGION + '_dates'`` column
    (region -> (date -> times)) and yields one row with ``key`` set to the
    yuid, ``subkey`` set to ``'yr'`` and ``value`` formatted as
    ``reg|date:times,date:times;reg|date:times``. Rows where the column is
    missing or empty yield nothing.
    """
    regions = rec.get(config.ID_TYPE_REGION + '_dates')
    if not regions:
        return

    region_chunks = []
    for region, date_times in regions.items():
        date_part = ','.join(
            ['%s:%s' % (day, times) for day, times in date_times.items()]
        )
        region_chunks.append('%s|%s' % (region, date_part))
    serialized = ';'.join(region_chunks)

    yield {
        'key': rec['yuid'],
        'subkey': 'yr',
        'value': serialized,
    }


def add_region_activities_by_ip(in_yuid_with_all, out_yuid_with_all, dict_folder, date):
    """
    Adds region activities derived from ip activities using geobase.

    Because of geobase problems the ips are made unique first and only then
    mapped to regions; the result is joined back to yuids by ip and finally
    merged into the yuid_with_all rows by yuid.

    :param in_yuid_with_all: path of the input yuid_with_all table
    :param out_yuid_with_all: path of the table that receives the enriched rows
    :param dict_folder: path prefix for the intermediate tables; table names are
        appended to it directly, so it has to end with a path separator
    :param date: passed to ``geo_utils.geodata_yt_table`` to pick the geodata file
    :return: path of the ``ip_region`` table, which is kept
        (``yuid_ips`` and ``yuid_ips_region`` are dropped)
    """
    yuid_ips = dict_folder + 'yuid_ips'
    ip_region = dict_folder + 'ip_region'
    yuid_ips_region = dict_folder + 'yuid_ips_region'

    # fetch only required fields to reduce io later
    # NOTE(review): map_yuid_ips currently reads only the FP column; the
    # access-log column is still requested here -- confirm whether it is needed
    yt.run_map(map_yuid_ips,
               yt.TablePath(in_yuid_with_all,
                            columns=['yuid',
                                     config.ID_TYPE_IP + '_' + config.ID_SOURCE_TYPE_FP + '_dates',
                                     config.ID_TYPE_IP + '_' + config.ID_SOURCE_TYPE_ACCESS_LOG + '_dates']),
               yuid_ips)

    # get all ips
    yt.run_sort(yuid_ips, sort_by='ip')
    yt.run_reduce(reduce_uniq_ip, yuid_ips,
                  ip_region,
                  reduce_by='ip')
    # and join region info to them in separate op to reduce job overhead
    # 500K lines per job to speed up; floor division keeps job_count an int
    # regardless of the interpreter's "/" semantics
    job_count = yt.row_count(ip_region) // 500000 + 1
    yt.run_map(geo_utils.mk_mapper_with_geobase_stats(map_ip_region),
               ip_region,
               ip_region,
               yt_files=[geo_utils.geodata_yt_table(date)],  # currently it uses local geobase bin
               job_count=job_count,
               memory_limit=1024 * 1024 * 1024)

    # join region info back to yuids
    mr.sort_all([ip_region,
                 yuid_ips], sort_by='ip')
    yt.run_reduce(reduce_yuid_ips_region,
                  [ip_region, yuid_ips],
                  yuid_ips_region,
                  reduce_by='ip')

    # join yuid region info back to yuid_with_all
    utils.wait_all([
        yt.run_sort(in_yuid_with_all, out_yuid_with_all, sort_by='yuid', sync=False),
        yt.run_sort(yuid_ips_region, sort_by='yuid', sync=False)
    ])
    yt.run_reduce(reduce_yuid_regs,
                  [out_yuid_with_all, yuid_ips_region],
                  out_yuid_with_all,
                  reduce_by='yuid')

    mr.drop(yuid_ips)
    mr.drop(yuid_ips_region)

    return ip_region


if __name__ == '__main__':
    # Manual run against a personal working directory.
    yt.config.set_proxy(config.MR_SERVER)
    # the reducers in this module tell their input tables apart by '@table_index'
    yt.config["tabular_data_format"] = yt.YsonFormat(process_table_index=True)

    workdir = '//home/crypta/team/artembelov/yuid_all_schema/'
    mr.mkdir(workdir)

    # add_region_activities_by_ip takes (in, out, dict_folder, date); the table
    # is enriched in place and workdir holds the intermediate tables
    yuid_with_all = workdir + 'yuid_with_all'
    add_region_activities_by_ip(yuid_with_all, yuid_with_all, workdir, '2016-01-25')

