import os
import numpy as np

__author__ = 'cansucullu'

# Whitelist of host names, read once at import time from the local ``hosts``
# file (one host per line).  run_exp() and run_assessors() keep only rows
# whose host is in this set.
hosts = set()
with open('hosts', 'r') as f:
    # ``with`` guarantees the handle is closed even if reading fails.
    for line in f:
        host = line.strip()
        # Skip blank lines: an empty string in the whitelist would match
        # every data row whose host column is empty.
        if host:
            hosts.add(host)

def run_exp(data_path='../EXPERIMENT-4520/host/full-data',
            out_path='urls_experiment', allowed_hosts=None):
    """Collect the distinct experiment URLs that belong to whitelisted hosts.

    Reads a tab-separated file with nine columns per row (query, source,
    pos, url, click, lclick, show, class, host), keeps the ``url`` of every
    row whose ``host`` is whitelisted, prints the number of distinct URLs
    and a sample of up to ten of them, and writes all of them to
    ``out_path``, one per line.

    Args:
        data_path: Path of the tab-separated experiment data file.
        out_path: Path of the file the selected URLs are written to.
        allowed_hosts: Container of accepted host names.  Defaults to the
            module-level ``hosts`` whitelist.

    Returns:
        The set of selected URLs.

    Raises:
        ValueError: If a non-blank row does not have exactly nine columns.
    """
    if allowed_hosts is None:
        allowed_hosts = hosts

    urls = set()
    with open(data_path, 'r') as data:
        for line_no, raw in enumerate(data, 1):
            row = raw.rstrip()
            if not row:
                # Blank (e.g. trailing) lines carry no data; skip them
                # instead of failing on the column count below.
                continue
            fields = row.split('\t')
            if len(fields) != 9:
                raise ValueError('%s:%d: expected 9 tab-separated fields, '
                                 'got %d' % (data_path, line_no, len(fields)))
            url, host = fields[3], fields[8]
            if host in allowed_hosts:
                urls.add(url)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(len(urls))
    print(list(urls)[:10])

    with open(out_path, 'w') as out:
        out.writelines(url + '\n' for url in urls)

    return urls


def run_assessors(marks_path='../host-analysis/tw-avg-no-404-hosts',
                  out_path='urls_judgements', allowed_hosts=None):
    """Collect the distinct assessed URLs that belong to whitelisted hosts.

    Reads a tab-separated file with three columns per row (host, url, mark),
    keeps the ``url`` of every row whose ``host`` is whitelisted, prints the
    number of distinct URLs and a sample of up to ten of them, and writes
    all of them to ``out_path``, one per line.

    Args:
        marks_path: Path of the tab-separated assessor judgement file.
        out_path: Path of the file the selected URLs are written to.
        allowed_hosts: Container of accepted host names.  Defaults to the
            module-level ``hosts`` whitelist.

    Returns:
        The set of selected URLs.

    Raises:
        ValueError: If a non-blank row does not have exactly three columns.
    """
    if allowed_hosts is None:
        allowed_hosts = hosts

    urls = set()
    with open(marks_path) as marks:
        for line_no, raw in enumerate(marks, 1):
            row = raw.rstrip()
            if not row:
                # Blank (e.g. trailing) lines carry no data; skip them
                # instead of failing on the column count below.
                continue
            fields = row.split('\t')
            if len(fields) != 3:
                raise ValueError('%s:%d: expected 3 tab-separated fields, '
                                 'got %d' % (marks_path, line_no, len(fields)))
            host, url = fields[0], fields[1]
            if host in allowed_hosts:
                urls.add(url)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(len(urls))
    print(list(urls)[:10])

    with open(out_path, 'w') as out:
        out.writelines(url + '\n' for url in urls)

    return urls


def calculate_mse(marks_path='../host-analysis/tw-avg-no-404-hosts',
                  scores_path='tw_class_set1.tsv',
                  out_path='tw_class_set1_done.tsv'):
    """Compute the mean squared error between scores and assessor marks.

    Assessor marks are read from ``marks_path`` (tab-separated: host, url,
    mark) and mapped to numbers: HIGHEST=1.0, HIGH=0.5, MIDDLE=0.0,
    LOW=-0.5, LOWEST=-1.0.  Scores are read from ``scores_path``
    (tab-separated: url, score).  A scored URL is matched against the marked
    URLs verbatim first, then with ``http://`` and ``https://`` prepended.
    Every matched row is written to ``out_path`` as
    ``url<TAB>score<TAB>numeric mark``; unmatched rows are skipped and their
    count is reported on stderr.  The array of differences and the MSE are
    printed to stdout.

    Args:
        marks_path: Path of the assessor judgement file.
        scores_path: Path of the classifier score file.
        out_path: Path of the file the matched rows are written to.

    Returns:
        The mean of ``(score - mark) ** 2`` over all matched rows, or
        ``nan`` if no row could be matched.

    Raises:
        ValueError: If a non-blank row has the wrong number of columns or a
            score is not a valid float.
        KeyError: If a matched mark is not one of the five known labels.
    """
    import sys

    mark_values = {
        "HIGHEST": 1.0,
        "HIGH": 0.5,
        "MIDDLE": 0.0,
        "LOW": -0.5,
        "LOWEST": -1.0,
    }

    # url -> textual mark; a later row for the same URL overrides an
    # earlier one.
    url_marks = {}
    with open(marks_path) as marks:
        for line_no, raw in enumerate(marks, 1):
            row = raw.rstrip()
            if not row:
                continue  # tolerate blank / trailing empty lines
            fields = row.split('\t')
            if len(fields) != 3:
                raise ValueError('%s:%d: expected 3 tab-separated fields, '
                                 'got %d' % (marks_path, line_no, len(fields)))
            url_marks[fields[1]] = fields[2]

    differences = []
    missing_count = 0
    with open(scores_path) as scores, open(out_path, 'w') as out:
        for line_no, raw in enumerate(scores, 1):
            row = raw.rstrip()
            if not row:
                continue  # tolerate blank / trailing empty lines
            fields = row.split('\t')
            if len(fields) != 2:
                raise ValueError('%s:%d: expected 2 tab-separated fields, '
                                 'got %d' % (scores_path, line_no, len(fields)))
            url, score = fields

            # Scored URLs may lack a scheme, so try the bare form first and
            # then both schemes, in that order.
            for candidate in (url, "http://" + url, "https://" + url):
                if candidate in url_marks:
                    target = mark_values[url_marks[candidate]]
                    break
            else:
                # No variant of this URL was assessed.
                missing_count += 1
                continue

            out.write(url + '\t' + score + '\t' + str(target) + '\n')
            differences.append(float(score) - target)

    if missing_count:
        # Report on stderr so the stdout output keeps its original shape.
        sys.stderr.write('calculate_mse: %d scored URL(s) had no assessor '
                         'mark and were skipped\n' % missing_count)

    differences = np.array(differences)
    print(differences)

    if len(differences) == 0:
        # Avoid a division by zero (and its RuntimeWarning) when nothing
        # matched; the error is undefined in that case.
        mse = float('nan')
    else:
        mse = np.sum(differences ** 2) / len(differences)
    print(mse)

    return mse


if __name__ == '__main__':
    # Script entry point.  The block below is a disabled comparison of the
    # URL sets returned by run_exp() and run_assessors(); it printed the
    # sizes of their differences, intersection and union, and of each set.
    # Uncomment it to re-run that comparison.
    #
    # set1 = run_exp()
    # set2 = run_assessors()
    #
    # print len(list(set1.difference(set2)))
    # print len(list(set1.intersection(set2)))
    # print len(list(set2.difference(set1)))
    # print len(list(set1.union(set2)))
    # print len(list(set1))
    # print len(list(set2))

    # Currently only the MSE between scores and assessor marks is computed.
    calculate_mse()
