#!/usr/bin/env python2
# -*- coding: utf8 -*-

from optparse import OptionParser
import urllib2
import optparse
from urlparse import urlparse
from collections import defaultdict, Counter
import sys
import random
import math
import operator


def get_options():
    """Parse the process command line and return the optparse values object.

    Every flag takes a string value and is ``None`` when it is not supplied:

    * ``-i`` / ``--inputData``
    * ``-u`` / ``--output_dataUrls``
    * ``-o`` / ``--output_dataStatistics``
    * ``-p`` / ``--param_poolSize``
    * ``-t`` / ``--param_Turbo``

    Positional arguments are accepted by the parser but not returned.
    """
    flag_pairs = (
        ("-i", "--inputData"),
        ("-u", "--output_dataUrls"),
        ("-o", "--output_dataStatistics"),
        ("-p", "--param_poolSize"),
        ("-t", "--param_Turbo"),
    )

    parser = optparse.OptionParser()
    for short_flag, long_flag in flag_pairs:
        # The attribute name on the result is derived from the long flag.
        parser.add_option(short_flag, long_flag, type=str)

    parsed_values, _positional = parser.parse_args()
    return parsed_values


def _read_url_pairs(input_path, turbo_prefix, url_suffix):
    """Read the input table and group its output rows by host.

    Each input line is split on tabs. The first column supplies the host
    (scheme + network location); the second column is the page URL. The
    stored row is "<decoded prefix+URL+suffix><TAB><decoded URL>".

    Returns a ``defaultdict(list)`` mapping host -> list of rows.
    """
    pairs_by_host = defaultdict(list)
    with open(input_path) as table:
        for line in table:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or single-column lines carry no URL to sample.
                continue
            parsed = urlparse(fields[0])
            host = parsed.scheme + '://' + parsed.netloc
            url_left = urllib2.unquote(turbo_prefix + fields[1] + url_suffix)
            url_right = urllib2.unquote(fields[1])
            # Percent-decoding can introduce tab characters; keep only the
            # first two tab-separated fields so a row is always two columns.
            row = '\t'.join((url_left + '\t' + url_right).split('\t')[:2])
            pairs_by_host[host].append(row)
    return pairs_by_host


def _sample_url_pairs(pairs_by_host, pool_size):
    """Randomly pick at most ``pool_size`` distinct rows across all hosts.

    For every host, ``pool_size`` draws are made. A draw counts for the
    host only when a uniformly chosen host equals it, and then one of that
    host's rows is chosen at random. Rows already picked are skipped, so
    the sample may be smaller than ``pool_size``.

    Returns ``(sampled_rows, sampled_by_host)``.
    """
    hosts = list(pairs_by_host)
    sampled_rows = []
    seen_rows = set()  # O(1) duplicate check instead of scanning the list
    sampled_by_host = defaultdict(list)

    for host, rows in pairs_by_host.items():
        for _ in range(pool_size):
            if len(sampled_rows) >= pool_size:
                # Pool is full: nothing more can be added for any host.
                return sampled_rows, sampled_by_host
            if random.choice(hosts) != host:
                continue
            row = random.choice(rows)
            if row in seen_rows:
                continue
            seen_rows.add(row)
            sampled_rows.append(row)
            sampled_by_host[host].append(row)

    return sampled_rows, sampled_by_host


def _write_statistics(stats_path, sampled_by_host, total_sampled):
    """Write "<host><TAB><count><TAB><share>%" lines, largest count first.

    ``share`` is the host's count as a percentage of ``total_sampled``,
    rounded to one decimal place.
    """
    ranked = sorted(sampled_by_host.items(),
                    key=lambda item: len(item[1]), reverse=True)
    with open(stats_path, 'w') as stats_file:
        for host, rows in ranked:
            share = round(float(len(rows)) / float(total_sampled) * 100, 1)
            stats_file.write(str(host) + '\t' + str(len(rows)) + '\t'
                             + str(share) + '%' + '\n')


def main():
    """Sample URL pairs from the input table and write the sample and stats.

    Reads the options returned by ``get_options()``, groups the input rows
    by host, draws a random sample of at most ``--param_poolSize`` distinct
    rows, writes the sampled rows to ``--output_dataUrls`` (one per line)
    and per-host counts with percentage shares to
    ``--output_dataStatistics``.

    Exits with an error message when a required option is missing or the
    pool size is not an integer.
    """
    options = get_options()

    required = ('inputData', 'output_dataUrls', 'output_dataStatistics',
                'param_poolSize', 'param_Turbo')
    missing = [name for name in required if getattr(options, name) is None]
    if missing:
        # Without this check a missing prefix would be rendered as the
        # literal text "None" in front of every URL.
        sys.exit('Missing required option(s): '
                 + ', '.join('--' + name for name in missing))

    try:
        pool_size = int(options.param_poolSize)
    except ValueError:
        sys.exit('--param_poolSize must be an integer, got %r'
                 % (options.param_poolSize,))

    adv_disabled_flag = '&exp_flags=adv-disabled'

    pairs_by_host = _read_url_pairs(options.inputData,
                                    options.param_Turbo,
                                    adv_disabled_flag)
    sampled_rows, sampled_by_host = _sample_url_pairs(pairs_by_host, pool_size)

    with open(options.output_dataUrls, 'w') as urls_file:
        urls_file.writelines(row + '\n' for row in sampled_rows)

    _write_statistics(options.output_dataStatistics, sampled_by_host,
                      len(sampled_rows))


# Run the sampling job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
