from utils import utils
from collections import defaultdict
from rtcconf import config


def unify_pair_value(pair):
    """
    Return the id value of `pair` in a unified form.

    For pairs whose 'pair_source' equals config.ID_TYPE_EMAIL only the text
    before the first '@' is returned; for every other source the value is
    returned unchanged (None when the pair has no 'id_value').
    """
    raw_value = pair.get('id_value')
    is_email = pair['pair_source'] == config.ID_TYPE_EMAIL
    return raw_value.split('@')[0] if is_email else raw_value


def get_id_sex(rec):
    """
    Classify the 'sex' field of a record into a one-letter gender code.

    The field is expected to look like '<male_prob>,<female_prob>'.
    Returns 'M' or 'F' when the corresponding probability is at least 0.7
    (male is checked first), 'N' when neither reaches the threshold and 'U'
    when the field is missing, empty or does not have exactly two parts.
    """
    raw = rec.get('sex')
    if not raw:
        return 'U'  # unknown

    parts = raw.split(',')
    if len(parts) != 2:
        return 'U'

    male_prob, female_prob = float(parts[0]), float(parts[1])
    if male_prob >= 0.7:
        return 'M'
    if female_prob >= 0.7:
        return 'F'
    return 'N'  # neutral: neither probability is confident enough


def avg(metrics):
    """
    Return the arithmetic mean of the values of the `metrics` mapping.

    Raises ZeroDivisionError when the mapping is empty.
    """
    values_total = sum(metrics.values())
    values_count = len(metrics)
    return values_total / float(values_count)


def weighted(metrics_w):
    """
    Return the weighted arithmetic mean of (metric, weight) pairs.

    `metrics_w` may be any iterable of 2-item pairs, including a one-shot
    iterator or generator: the pairs are consumed in a single pass.

    Raises ZeroDivisionError when the weights sum to zero (which includes
    an empty input).
    """
    weighted_sum = 0
    total_weight = 0
    # Accumulate both sums in one pass so that iterators are not exhausted
    # before the weights have been counted.
    for metric, weight in metrics_w:
        weighted_sum += metric * weight
        total_weight += weight
    return weighted_sum / float(total_weight)


def list_100(some_collection):
    """
    Copy the collection into a new list and keep at most its first 100 items.
    Use to avoid too large value in yt row
    """
    items = list(some_collection)
    del items[100:]
    return items


def crypta_id_stats(vertices_recs):
    """
    Aggregate the vertex records of one crypta id into a single stats record.

    Every record must contain 'key' and 'id_type'; 'ua_profile', 'sex',
    'browser', 'region' and 'id_values' are optional. Records whose id_type is
    'deviceid' are counted as devids, all other records are counted as yuids.

    The device type of a record is the second '|'-separated field of its
    'ua_profile'; it is 'unknown' when the profile is missing or empty, has no
    second field, or that field is empty.

    Returns a dict with the collected ids, browsers, regions and ua profiles
    (each list truncated by list_100, with the full size stored under the
    matching '*_count' key), per-device-type and per-gender counters, the
    'mobile_only' flag ('mobile', 'desktop' or 'cross') and the id values
    grouped by value type.
    """
    # yuids
    yuids = set()
    yuid_device_type_counts = defaultdict(int)
    yuid_mobile_desktop_counts = defaultdict(int)

    browsers = set()
    yuid_ua_profiles = set()

    gender_counts = defaultdict(int)
    regions = set()

    # devids
    devids = set()
    devid_device_type_counts = defaultdict(int)

    # ua profiles of devids: mobile ones and, among them, those containing '|un'
    mobile_ua_profiles = set()
    unknown_ua_profiles = set()

    # values
    id_values_per_type = defaultdict(set)

    for rec in vertices_recs:
        id_key = rec['key']
        ua_profile = rec.get('ua_profile')

        device_type = 'unknown'
        if ua_profile:
            # A profile without a '|' separator has no device-type field;
            # keep 'unknown' instead of failing with IndexError.
            ua_fields = ua_profile.split('|')
            if len(ua_fields) > 1 and ua_fields[1]:
                device_type = ua_fields[1]

        if rec['id_type'] == 'deviceid':
            devids.add(id_key)
            devid_device_type_counts[device_type] += 1

            if ua_profile and ua_profile.startswith('m'):
                mobile_ua_profiles.add(ua_profile)
                if '|un' in ua_profile:
                    unknown_ua_profiles.add(ua_profile)

        else:
            yuids.add(id_key)
            yuid_device_type_counts[device_type] += 1

            if ua_profile:
                yuid_ua_profiles.add(ua_profile)
                # Profiles that are neither 'd|desk...' nor 'm...' are not
                # counted as desktop or mobile.
                if ua_profile.startswith('d|desk'):
                    yuid_mobile_desktop_counts['d'] += 1
                elif ua_profile.startswith('m'):
                    yuid_mobile_desktop_counts['m'] += 1

            gender_counts[get_id_sex(rec)] += 1

            browser = rec.get('browser')
            if browser:
                browsers.add(browser)

            region = rec.get('region')
            if region:
                regions.add(region)

        if 'id_values' in rec:
            # .items() works on both Python 2 and Python 3 mappings
            for value_type, id_values in rec['id_values'].items():
                id_values_per_type[value_type].update(id_values)

    out_rec = dict()
    out_rec['yuids'] = list_100(yuids)
    out_rec['yuids_count'] = len(yuids)
    out_rec['devids'] = list_100(devids)
    out_rec['devids_count'] = len(devids)
    out_rec['crypta_id_size'] = out_rec['yuids_count'] + out_rec['devids_count']

    out_rec['yuid_device_type_counts'] = utils.default_to_regular(yuid_device_type_counts)
    out_rec['devid_device_type_counts'] = utils.default_to_regular(devid_device_type_counts)
    out_rec['yuid_mobile_desktop_counts'] = utils.default_to_regular(yuid_mobile_desktop_counts)

    # 'cross' unless the yuids were seen on exactly one kind of platform
    # (it also stays 'cross' when no yuid had a mobile/desktop profile at all)
    mobile_only = 'cross'
    if len(yuid_mobile_desktop_counts) == 1:
        if 'm' in yuid_mobile_desktop_counts:
            mobile_only = 'mobile'
        elif 'd' in yuid_mobile_desktop_counts:
            mobile_only = 'desktop'
    out_rec['mobile_only'] = mobile_only

    out_rec['browsers'] = list_100(browsers)
    out_rec['browsers_count'] = len(browsers)

    out_rec['regions'] = list_100(regions)
    out_rec['regions_count'] = len(regions)

    out_rec['yuid_ua_profiles'] = list_100(yuid_ua_profiles)
    out_rec['yuid_ua_profiles_count'] = len(yuid_ua_profiles)

    out_rec['genders'] = utils.default_to_regular(gender_counts)

    out_rec['mobile_ua_profiles'] = list_100(mobile_ua_profiles)
    out_rec['mobile_ua_profiles_count'] = len(mobile_ua_profiles)
    out_rec['unknown_devid_ua'] = list_100(unknown_ua_profiles)
    out_rec['unknown_devid_ua_count'] = len(unknown_ua_profiles)

    out_rec['id_values'] = utils.default_to_regular(id_values_per_type)

    return out_rec


def crypta_id_quality_metrics(crypta_id_stats_rec):
    """
    Compute overmatching penalties for one crypta id from its stats record.

    Reads 'browsers_count', 'devids_count', 'regions_count', 'genders' and
    'id_values' from the record. Each count-based penalty is 0 while the
    count stays within its limit (4 browsers, 3 devids, 1 region) and
    1 - limit / count above it. The sex penalty is non-zero only when both
    'M' and 'F' yuids are present and grows as their counts get closer.
    The too-many-values penalty sums, over config.YUID_PAIR_TYPES_EXACT, the
    excess of id values of each type over that type's strict limit.

    Returns a dict with the individual penalties plus their weighted mean
    under 'overmatching_penalty' (sex penalty has weight 2, the rest 1).
    """
    stats = crypta_id_stats_rec
    n_browsers = stats['browsers_count']
    n_devids = stats['devids_count']
    n_regions = stats['regions_count']

    browsers_penalty = 0
    if n_browsers > 4:
        browsers_penalty = 1 - (4.0 / n_browsers)

    devices_penalty = 0
    if n_devids > 3:
        devices_penalty = 1 - (3.0 / n_devids)

    regions_penalty = 0
    if n_regions > 1:
        regions_penalty = 1 - (1.0 / n_regions)

    gender_counts = stats['genders']
    male = gender_counts.get('M', 0)
    female = gender_counts.get('F', 0)
    sex_penalty = 0
    if male and female:
        # equal numbers of male and female yuids give the maximum penalty of 1
        sex_penalty = 1 - abs(male - female) / float(male + female)

    too_many_values_penalty = 0
    for pair_type in config.YUID_PAIR_TYPES_EXACT:
        values_of_type = stats['id_values'].get(pair_type.id_type)
        if not values_of_type:
            continue
        limit = pair_type.ids_per_yuid_strict_limit
        values_count = len(values_of_type)
        if values_count > limit:
            too_many_values_penalty += 1 - (limit / float(values_count))

    overmatching_penalty = weighted([(browsers_penalty, 1),
                                     (devices_penalty, 1),
                                     (regions_penalty, 1),
                                     (sex_penalty, 2),
                                     (too_many_values_penalty, 1)])

    return {
        'browsers_penalty': browsers_penalty,
        'devices_penalty': devices_penalty,
        'regions_penalty': regions_penalty,
        'sex_penalty': sex_penalty,
        'too_many_values_penalty': too_many_values_penalty,
        'overmatching_penalty': overmatching_penalty,
    }


def clustering_quality_metrics(vertices_by_components):
    """
    Score how well the vertices of one crypta id were split into components.

    `vertices_by_components` maps a component to a collection of its vertex
    records. Each collection is iterated twice here, so it must be
    re-iterable (e.g. a list, not a generator).

    Returns a dict with 'components_count', 'split_values' (id values present
    in more than one component -> number of extra components), the individual
    penalties and three weighted aggregates: 'oversplicing_penalty',
    'overmatching_penalty' and the total 'penalty'.

    Raises ZeroDivisionError when `vertices_by_components` is empty, because
    the per-component penalties are averaged over zero components.
    """
    all_yuids_and_devids = set()
    components_of_id_values = defaultdict(set)
    components_mobile_desktop = dict()
    crypta_id_mobile_desktop = set()

    browsers_penalty = dict()
    devices_penalty = dict()
    sex_penalty = dict()
    regions_penalty = dict()
    # .items() works on both Python 2 and Python 3 mappings
    for component, vertices in vertices_by_components.items():
        stats_rec = crypta_id_stats(vertices)
        quality_metrics_rec = crypta_id_quality_metrics(stats_rec)

        all_yuids_and_devids.update(vertex_rec['key'] for vertex_rec in vertices)

        component_mobile_desktop = stats_rec['yuid_mobile_desktop_counts'].keys()
        crypta_id_mobile_desktop.update(component_mobile_desktop)
        components_mobile_desktop[component] = component_mobile_desktop

        if 'id_values' in stats_rec:
            for id_values in stats_rec['id_values'].values():
                for id_value in id_values:
                    components_of_id_values[id_value].add(component)

        browsers_penalty[component] = quality_metrics_rec['browsers_penalty']
        devices_penalty[component] = quality_metrics_rec['devices_penalty']
        sex_penalty[component] = quality_metrics_rec['sex_penalty']
        regions_penalty[component] = quality_metrics_rec['regions_penalty']

    out_rec = dict()

    # some stats
    components_count = len(vertices_by_components)
    out_rec['components_count'] = components_count

    # how many pair values are split among components
    split_values = {
        str(value): len(components) - 1
        for value, components in components_of_id_values.items()
        if len(components) > 1
    }
    out_rec['split_values'] = split_values
    if components_of_id_values:
        split_id_values_penalty_avg = len(split_values) / float(len(components_of_id_values))
    else:
        split_id_values_penalty_avg = 0
    out_rec['split_id_values_penalty'] = split_id_values_penalty_avg

    # components should be large enough: 0 for one component, 1 when every id
    # sits in its own component. With fewer than two ids there is nothing to
    # split and the ratio is undefined (division by zero), so use 0.
    ids_count = len(all_yuids_and_devids)
    if ids_count > 1:
        split_ratio_penalty_avg = (components_count - 1) / float(ids_count - 1)
    else:
        split_ratio_penalty_avg = 0.0
    out_rec['split_ratio_penalty'] = split_ratio_penalty_avg

    # number of browsers per person should be reasonable
    browsers_penalty_avg = avg(browsers_penalty)
    out_rec['browsers_penalty'] = browsers_penalty_avg

    # number of devices per person should be reasonable
    devices_penalty_avg = avg(devices_penalty)
    out_rec['devices_penalty'] = devices_penalty_avg

    # men and women should not be placed in a single component
    sex_penalty_avg = avg(sex_penalty)
    out_rec['sex_penalty'] = sex_penalty_avg

    # usually a user lives in a single region
    regions_penalty_avg = avg(regions_penalty)
    out_rec['regions_penalty'] = regions_penalty_avg

    # it's strange when we split a mobile from its desktop
    mobile_only_penalty = 0
    crypta_id_has_only_mobile = 'm' in crypta_id_mobile_desktop and len(crypta_id_mobile_desktop) == 1

    if not crypta_id_has_only_mobile:  # we can't split mobile from desktop in a mobile-only crypta id
        mobile_only_components = sum(
            1 for platforms in components_mobile_desktop.values()
            if 'm' in platforms and len(platforms) == 1
        )
        mobile_only_penalty = mobile_only_components / float(components_count)

    out_rec['mobile_only_penalty'] = mobile_only_penalty

    out_rec['oversplicing_penalty'] = weighted([(split_id_values_penalty_avg, 1),
                                                (mobile_only_penalty, 1)])

    out_rec['overmatching_penalty'] = weighted([(browsers_penalty_avg, 1),
                                                (devices_penalty_avg, 1),
                                                (regions_penalty_avg, 1),
                                                (sex_penalty_avg, 2)])

    # NOTE(review): the regions penalty is not part of the total penalty,
    # unlike 'overmatching_penalty' above - confirm this is intended.
    out_rec['penalty'] = weighted([(split_id_values_penalty_avg, 3),
                                   (split_ratio_penalty_avg, 1),
                                   (browsers_penalty_avg, 1),
                                   (devices_penalty_avg, 1),
                                   (sex_penalty_avg, 3),
                                   (mobile_only_penalty, 2)])
    return out_rec
