#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import codecs
import argparse
import json

import tldextract

def get_host(x):
    """Return the registered domain of URL/host string *x*.

    Delegates to ``tldextract.extract`` and returns the result's
    ``registered_domain`` attribute unchanged.
    """
    extracted = tldextract.extract(x)
    return extracted.registered_domain

# Leading host labels that are removed by strip_prefices().
bad_prefices = (
    'www.',
    'm.',
)


def strip_prefices(str_):
    """Return *str_* with every leading entry of ``bad_prefices`` removed.

    Prefixes are stripped repeatedly, so stacked prefixes such as
    ``"m.www.example.com"`` are reduced all the way to ``"example.com"``.
    """
    remaining = str_
    # str.startswith accepts a tuple: true if any listed prefix matches.
    while remaining.startswith(bad_prefices):
        for prefix in bad_prefices:
            if remaining.startswith(prefix):
                remaining = remaining[len(prefix):]
    return remaining

def choose_categ(x, categs):
    """Pick a category for record *x* and store the matching host on it.

    The host of ``x['canon_url']`` is looked up in *categs* first. Only when
    that yields ``'UNKNOWN'`` is the host of ``x['page_url']`` tried; it is
    used if it maps to a truthy category.

    Side effect: sets ``x['host']`` to the host whose category was chosen
    (the canonical host when neither lookup succeeds).

    Returns the chosen category, or ``'UNKNOWN'`` if none was found.
    """
    canon_host = get_host(x['canon_url'])
    cat = categs.get(canon_host, 'UNKNOWN')

    if cat == 'UNKNOWN':
        # Fall back to the page URL only when the canonical host is unknown,
        # so 'page_url' is never read for records that already resolved.
        page_host = get_host(x['page_url'])
        page_cat = categs.get(page_host)
        if page_cat:
            x['host'] = page_host
            return page_cat

    x['host'] = canon_host
    return cat


if __name__ == "__main__":
    # Script entry point:
    #   1. load the JSON records given by --musca,
    #   2. load the tab-separated host -> category table given by --host_cats,
    #   3. set 'host_cat' (and, via choose_categ, 'host') on every record,
    #   4. write the annotated records as JSON to --output.
    parser = argparse.ArgumentParser()
    parser.add_argument('--musca', '-m', required=True)
    # The short flag was originally registered with a Cyrillic "с" (U+0441),
    # which cannot be typed as a plain Latin "-c". Register the Latin flag and
    # keep the old spelling as an alias so existing invocations still work.
    parser.add_argument('--host_cats', '-c', '-с', required=True)
    parser.add_argument('--output', '-o', required=True)
    args = parser.parse_args()

    # Read the input as UTF-8 and close the handle deterministically.
    with codecs.open(args.musca, 'r', 'utf8') as f:
        musca = json.load(f)

    host_cats = {}
    with codecs.open(args.host_cats, 'r', 'utf8') as f:
        for line in f:
            t = line.strip().split('\t')
            # Skip blank lines and lines without a category column.
            if len(t) < 2:
                continue
            host_cats[strip_prefices(t[0])] = t[1]

    for x in musca:
        x['host_cat'] = choose_categ(x, host_cats)

    # Use a context manager so the output is flushed and closed before exit.
    with codecs.open(args.output, 'w', 'utf8') as out:
        json.dump(
            musca, out,
            indent=2, ensure_ascii=False, sort_keys=True
        )
