#!/usr/bin/env python

import numpy

def load_tld_map(path="tld.list"):
    """Build a mapping from each entry of a TLD list file to a dense index.

    The file is read one entry per line.  Surrounding whitespace (including
    the newline) is stripped and duplicate entries are collapsed.  Indices
    follow the sorted order of the distinct entries, so the same file
    contents always produce the same mapping.

    Note that a blank line contributes the empty string as an entry.

    Args:
        path: File to read.  Defaults to "tld.list", resolved against the
            current working directory, which is the location that was
            previously hard-coded.

    Returns:
        dict mapping every distinct stripped line to its 0-based position
        in sorted order.

    Raises:
        IOError / OSError: if the file cannot be opened.
    """
    with open(path) as f:
        tlds = {line.strip() for line in f}

    return {tld: index for index, tld in enumerate(sorted(tlds))}

def build_ngramms_map():
    """Enumerate every two-character combination of the hostname alphabet.

    The alphabet is the lower-case ASCII letters, the digits, '-' and '_'.
    All ordered pairs over it (including a character paired with itself)
    are sorted lexicographically and numbered from zero.

    Returns:
        dict mapping each two-character string to its 0-based position in
        sorted order.
    """
    alphabet = "qwertyuiopasdfghjklzxcvbnm1234567890-_"

    # Every ordered pair (first, second) drawn from the alphabet.
    pairs = {first + second for first in alphabet for second in alphabet}

    return {pair: position for position, pair in enumerate(sorted(pairs))}

def get_features(tld_map, ngramms_map, hostname, target):
    """Encode a hostname as a single-row numpy feature matrix.

    Row layout (column indices):
        0                   -- ``target``, stored unchanged
        1                   -- length of ``hostname`` in characters
        2                   -- number of dot-separated labels
        3 .. 3+N-1          -- occurrence count of each bigram, where N is
                               ``len(ngramms_map)``; bigrams are taken from
                               every label except the last one
        3+N .. 3+N+T-1      -- one-hot marker for the last label (the TLD),
                               where T is ``len(tld_map)``

    Labels are lower-cased before bigrams are counted, and bigrams missing
    from ``ngramms_map`` are ignored, so upper-case hostnames or characters
    outside the bigram alphabet no longer raise ``KeyError``.

    Args:
        tld_map: dict mapping a TLD string to its 0-based column index.
            Must contain the key "etc", which is used for unknown TLDs.
        ngramms_map: dict mapping a two-character string to its 0-based
            column index.
        hostname: host name to encode, labels separated by '.'.
        target: value stored in column 0.

    Returns:
        numpy array of shape ``(1, 3 + len(ngramms_map) + len(tld_map))``.

    Raises:
        KeyError: if the TLD is unknown and ``tld_map`` has no "etc" entry.
    """
    ngramms_features_offset = 3
    tld_features_offset = ngramms_features_offset + len(ngramms_map)
    features = [0] * (tld_features_offset + len(tld_map))

    labels = hostname.split(".")

    features[0] = target
    features[1] = len(hostname)
    features[2] = len(labels)

    # Prefer an exact match so maps with non-lower-case keys keep working,
    # then retry case-insensitively before giving up on the catch-all bucket.
    tld = labels[-1]
    if tld not in tld_map:
        lowered = tld.lower()
        tld = lowered if lowered in tld_map else "etc"
    features[tld_features_offset + tld_map[tld]] = 1

    for label in labels[:-1]:
        label = label.lower()
        for i in range(len(label) - 1):
            # Skip bigrams the map does not know instead of failing on them.
            index = ngramms_map.get(label[i:i + 2])
            if index is not None:
                features[ngramms_features_offset + index] += 1

    return numpy.array([features])

if __name__ == "__main__":
    # Needed only by this entry point, so it is imported locally.
    import sys

    # Print arrays in full and on a single line.  sys.maxsize is used as the
    # "never summarise" threshold because NumPy rejects NaN for this option.
    numpy.set_printoptions(threshold=sys.maxsize, linewidth=132000)

    ngramms_map = build_ngramms_map()
    tld_map = load_tld_map()

    # Smoke test: print the feature row for one sample hostname.
    # The parenthesised single-argument form works on Python 2 and Python 3.
    hostname = "lenta.ru"
    print(get_features(tld_map, ngramms_map, hostname, 0))

    # The string below is a disabled snippet (a no-op expression statement)
    # that appends one feature row per line of "data.shuf" to "train.txt".
    '''
    with open('train.txt','a') as train_file_handle:
        with open("data.shuf") as f:
            for line in f:
                target, hostname = line.strip().split()
                target = int(target)
                numpy.savetxt(train_file_handle, get_features(tld_map, ngramms_map, hostname, target), fmt='%d')
                #print counter, " ".join(get_features(tld_map, ngramms_map, hostname)), 1
    '''
