#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
import json
from collections import Counter
import random
import tldextract


def extract_host(url):
    """Return the ``registered_domain`` that tldextract reports for *url*."""
    parsed = tldextract.extract(url)
    return parsed.registered_domain


def _load_json(path):
    """Return the object parsed from the JSON file at *path*."""
    # Builtin open() without an encoding argument keeps this working on
    # both Python 2 and Python 3, which the __future__ imports target.
    with open(path) as src:
        return json.load(src)


def _dump_json(obj, path):
    """Write *obj* to *path* as JSON indented by two spaces."""
    # The context manager guarantees the data is flushed and the handle
    # closed even if serialization raises part-way through.
    with open(path, 'w') as dst:
        json.dump(obj, dst, indent=2)


def _split_in_half(items, rnd):
    """Shuffle *items* in place with *rnd* and return ``(first, second)``.

    ``first`` holds ``len(items) // 2`` elements, so ``second`` receives
    the extra element when the length is odd.
    """
    rnd.shuffle(items)
    border = len(items) // 2
    return items[:border], items[border:]


def _print_top_hosts(title, records, limit=20):
    """Print *title*, then the *limit* most common hosts of *records*.

    Each record's ``canon_url`` is mapped through ``extract_host`` and
    the resulting counts are printed as ``host<TAB>count`` lines.
    """
    print(title)
    hosts = Counter(extract_host(rec['canon_url']) for rec in records)
    for host, count in hosts.most_common(limit):
        print('{}\t{}'.format(host, count))


def main():
    """Split Toloka-approved records into "validate" and "kpi" JSON files.

    Command-line arguments:
      --data_for_join / -d  JSON file whose keys are matched against the
                            approved URLs and whose values are the records.
      --toloka_output / -t  JSON list of items; an item's ``ji.url`` counts
                            as approved when ``result.result`` is ``'YES'``.
      --validate / -v       output path for the first half of the split.
      --kpi / -k            output path for the second half of the split.
      --by_host / -bh       shuffle and halve all approved records at once
                            instead of halving each ``cat2n`` group
                            separately.

    After writing both files, the 20 most common hosts of each half are
    printed to stdout.

    NOTE(review): despite its name, ``--by_host`` performs no grouping by
    host; it only disables the per-``cat2n`` stratification. Confirm that
    this is the intended behavior.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_for_join', '-d', required=True)
    parser.add_argument('--toloka_output', '-t', required=True)
    parser.add_argument('--validate', '-v', required=True)
    parser.add_argument('--kpi', '-k', required=True)
    parser.add_argument('--by_host', '-bh', action='store_true')
    args = parser.parse_args()

    data_for_join = _load_json(args.data_for_join)
    toloka_output = _load_json(args.toloka_output)

    # URLs that the Toloka results marked with a 'YES' verdict.
    approved_urls = {
        x['ji']['url'] for x in toloka_output if x['result']['result'] == 'YES'
    }

    clean_data = [
        data_for_join[key] for key in data_for_join if key in approved_urls
    ]

    # SystemRandom draws from the OS entropy source, so the split is not
    # reproducible between runs.
    sysrnd = random.SystemRandom()

    if args.by_host:
        validate, kpi = _split_in_half(clean_data, sysrnd)
    else:
        # Group records by category in one pass instead of rescanning
        # clean_data once per category.
        by_cat = {}
        for rec in clean_data:
            by_cat.setdefault(rec['cat2n'], []).append(rec)

        validate = []
        kpi = []
        # Halve every category separately so both outputs receive
        # (almost) the same number of records from each category.
        for records in by_cat.values():
            first, second = _split_in_half(records, sysrnd)
            validate.extend(first)
            kpi.extend(second)

    _dump_json(validate, args.validate)
    _dump_json(kpi, args.kpi)

    _print_top_hosts('validate:', validate)
    _print_top_hosts('kpi:', kpi)


# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
