#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import json
import math
import random
import argparse
import tldextract
from collections import Counter
import logging
logging.getLogger('tldextract').addHandler(logging.NullHandler())


def get_host(url):
    """Return the ``registered_domain`` that tldextract reports for *url*."""
    extracted = tldextract.extract(url)
    return extracted.registered_domain


# Single SystemRandom instance shared by every random draw in this script.
sr = random.SystemRandom()


def weighted_choice(choices):
    """Pick a random index into *choices* according to each item's weight.

    Every element of *choices* must be a mapping with a numeric ``'weight_'``
    entry.  A threshold is drawn uniformly between 0 and the sum of all
    weights, and the index of the first element whose cumulative weight
    reaches that threshold is returned.

    Returns ``None`` when *choices* is empty.  For a non-empty sequence an
    index is always returned: if no cumulative weight reaches the threshold
    (possible when the total is rounded differently from the running sum, or
    when weights are not all positive) the last index is used, so callers
    never receive ``None`` for non-empty input.
    """
    if not choices:
        return None

    total = sum(c['weight_'] for c in choices)
    threshold = sr.uniform(0, total)

    cumulative = 0
    for index, choice in enumerate(choices):
        cumulative += choice['weight_']
        if cumulative >= threshold:
            return index

    # Nothing matched: fall back to the last element instead of returning
    # None, which the caller would use as a list index.
    return len(choices) - 1

def main():
    """Sample URLs from a JSON dump, bucketed by ``cnt``, and write them out.

    Command-line options:
      --input / -i    path of the JSON file to read (required); it must hold
                      a list of objects with at least the keys ``canon_url``,
                      ``cnt``, ``spike_ts_yandex_video`` and
                      ``spike_ts_yandex_web``.
      --output / -o   path of the JSON file to write (default ``output.json``).
      --urls / -u     total number of URLs to sample across all buckets
                      (default 1000).
      --balance / -b  normalise host and URL weights inside each bucket by
                      the bucket maximum before sampling.

    Records are sorted by ``cnt`` and split into ten buckets; up to
    ``urls // 10`` records are drawn without replacement from each bucket
    via ``weighted_choice``.  Afterwards up to 1000 not-yet-selected records
    that have a truthy spike timestamp are appended with ``disable_sbr = 1``.
    The ten most common hosts of the result are printed, and the result is
    dumped to the output file.  Records are annotated in place with the
    helper keys computed along the way (``host``, ``host_weight``,
    ``url_weight``, ``weight_``, ``weight``, ...), so these keys appear in
    the output as well.
    """
    parser = argparse.ArgumentParser()
    # Without a path there is nothing to read, so let argparse report the
    # missing option instead of failing later inside open().
    parser.add_argument('--input', '-i', required=True)
    parser.add_argument('--output', '-o', default='output.json')
    parser.add_argument('--urls', '-u', type=int, default=1000)
    parser.add_argument('--balance', '-b', action='store_true')
    args = parser.parse_args()

    num_buckets = 10
    max_extra_spikes = 1000

    with open(args.input) as src:
        records = json.load(src)

    # Output weight attached to records drawn from bucket k (0-based):
    # log2(k + 2).
    bucket_weights = [
        math.log(rank + 1, 2) for rank in range(1, num_buckets + 1)
    ]

    # Count how many records share each host.
    hosts = Counter()
    for rec in records:
        rec['host'] = get_host(rec['canon_url'])
        hosts[rec['host']] += 1

    for rec in records:
        per_host = hosts[rec['host']]
        # Per-record share of a log-damped host total.
        rec['host_weight'] = math.log(per_host + 1, 2) / per_host
        # Equals log2(cnt / 50); zero or negative when cnt <= 50.
        # NOTE(review): presumably the input is pre-filtered to cnt > 50 --
        # confirm, since non-positive weights distort the sampling.
        rec['url_weight'] = math.log(rec['cnt'], 2) - math.log(50, 2)
        if not args.balance:
            rec['weight_'] = rec['url_weight'] * rec['host_weight']

    records.sort(key=lambda rec: rec['cnt'])

    # Split into equally sized buckets by ascending cnt.  The last bucket
    # also takes the remainder so that no record is silently dropped.
    bucket_len = len(records) // num_buckets
    buckets = [
        records[k * bucket_len:(k + 1) * bucket_len]
        for k in range(num_buckets - 1)
    ]
    buckets.append(records[(num_buckets - 1) * bucket_len:])

    if args.balance:
        for bucket in buckets:
            if not bucket:
                # max() of an empty bucket would raise; nothing to balance.
                continue
            max_host_weight = max(x['host_weight'] for x in bucket)
            # NOTE(review): a bucket whose largest url_weight is 0 (all
            # cnt <= 50 with at least one cnt == 50) still divides by zero
            # below -- confirm the input can never look like that.
            max_url_weight = max(x['url_weight'] for x in bucket)
            for x in bucket:
                x['host_weight_balanced'] = x['host_weight'] / max_host_weight
                x['url_weight_balanced'] = x['url_weight'] / max_url_weight
                x['weight_'] = (
                    x['host_weight_balanced'] * x['url_weight_balanced']
                )

    per_bucket = args.urls // num_buckets

    # Draw without replacement: each picked record is removed from its
    # bucket so it cannot be chosen twice and is skipped by the pass below.
    result = []
    for bucket, weight in zip(buckets, bucket_weights):
        for _ in range(min(per_bucket, len(bucket))):
            picked = bucket.pop(weighted_choice(bucket))
            picked['weight'] = weight
            result.append(picked)

    # Top up with leftover records that carry a spike timestamp and whose
    # canon_url has not been selected already.
    canon_urls = {x['canon_url'] for x in result}
    extra_spikes = []
    for bucket in buckets:
        extra_spikes.extend([
            x for x in bucket
            if x['canon_url'] not in canon_urls and
            (x['spike_ts_yandex_video'] or x['spike_ts_yandex_web'])
        ])
    sr.shuffle(extra_spikes)
    extra_spikes = extra_spikes[:max_extra_spikes]
    for x in extra_spikes:
        x['disable_sbr'] = 1
    result += extra_spikes

    hosts_final = Counter(x['host'] for x in result)
    print(hosts_final.most_common(10))

    with open(args.output, 'w') as dst:
        json.dump(result, dst, indent=2, sort_keys=True)


# Run the sampler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
