#!/usr/bin/env python2.7
# coding=utf-8

import sys
import urllib
import random
from optparse import OptionParser
import optparse
from collections import defaultdict, OrderedDict


def get_options():
    """Parse and validate the command-line options of the script.

    All four options are required.  ``--step`` and ``--sampleLen`` must be
    integer literals; they are still returned as strings, so callers keep
    converting them with ``int()`` as before.

    Returns:
        The ``optparse.Values`` object with the attributes ``inputData``,
        ``outputData``, ``step`` and ``sampleLen``.

    Raises:
        SystemExit: via ``OptionParser.error`` (exit status 2, usage message
            on stderr) when an option is missing or is not an integer.
    """
    p = optparse.OptionParser()
    p.add_option("-i", "--inputData", type="string",
                 help="path of the tab-separated input file")
    p.add_option("-o", "--outputData", type="string",
                 help="path of the file the sample is written to")
    p.add_option("-s", "--step", type="string",
                 help="maximum number of URLs taken from a single host")
    p.add_option("-l", "--sampleLen", type="string",
                 help="maximum total number of rows in the sample")
    options, _args = p.parse_args()

    # Fail here with a usage message instead of letting a missing value
    # surface later as an opaque TypeError from int(None) / open(None).
    for name in ("inputData", "outputData", "step", "sampleLen"):
        if getattr(options, name) is None:
            p.error("option --%s is required" % name)

    for name in ("step", "sampleLen"):
        value = getattr(options, name)
        try:
            int(value)
        except ValueError:
            p.error("option --%s must be an integer, got %r" % (name, value))

    return options


def _read_urls_by_host(path):
    """Group the turbo URLs built from a tab-separated file by host.

    Every non-blank line must contain at least five tab-separated fields.
    Field 2 is the host, field 3 is the text that is URL-quoted into the
    turbo link, and field 4 is carried through unchanged as the first output
    column.

    Returns:
        An ``OrderedDict`` mapping each host, in order of first appearance,
        to the list of its ``"<field 4><TAB><turbo url>"`` entries.

    Raises:
        ValueError: if a non-blank line has fewer than five fields.
    """
    # Function-scope import so the block works with both module layouts.
    try:
        from urllib import quote_plus  # Python 2
    except ImportError:
        from urllib.parse import quote_plus  # Python 3

    urls_by_host = OrderedDict()
    with open(path) as source:
        for line_no, line in enumerate(source, 1):
            stripped = line.strip()
            if not stripped:
                # A blank (typically trailing) line carries no data.
                continue
            fields = stripped.split('\t')
            if len(fields) < 5:
                raise ValueError(
                    '%s:%d: expected at least 5 tab-separated fields, got %d'
                    % (path, line_no, len(fields)))
            turbo_url = ('https://yandex.ru/turbo?text=' + quote_plus(fields[3])
                         + '&exp_flags=adv-disabled')
            urls_by_host.setdefault(fields[2], []).append(
                fields[4] + '\t' + turbo_url)
    return urls_by_host


def _build_sample(urls_by_host, step, sample_len):
    """Select at most *step* entries per host and *sample_len* overall.

    Hosts are visited in the iteration order of *urls_by_host*.  A host with
    more than *step* entries contributes a random subset of *step* of them
    with the coefficient ``len(entries) / step``; any other host contributes
    all of its entries with the coefficient ``1.0``.

    Returns:
        A list of ``(entry, coefficient)`` tuples.
    """
    sample = []
    for entries in urls_by_host.values():
        if len(sample) >= sample_len:
            break
        if len(entries) > step:
            coef = float(len(entries)) / step
            # Shuffle a copy so the grouped input is left untouched.
            shuffled = list(entries)
            random.shuffle(shuffled)
            chosen = shuffled[:step]
        else:
            coef = 1.0
            chosen = entries
        room = sample_len - len(sample)  # always positive at this point
        sample.extend((entry, coef) for entry in chosen[:room])
    return sample


def _write_sample(path, sample):
    """Write one ``"<entry><TAB><coefficient>"`` line per sampled item."""
    with open(path, 'w') as res_data:
        res_data.writelines(
            '%s\t%s\n' % (entry, coef) for entry, coef in sample)


def main():
    """Build a per-host limited URL sample and write it to the output file.

    Reads the options through ``get_options()``, groups the input rows by
    host, keeps at most ``--step`` rows per host (a random subset when the
    host has more) and at most ``--sampleLen`` rows in total, then writes
    ``<field 4><TAB><turbo url><TAB><coefficient>`` lines in the order in
    which hosts first appeared in the input.

    Raises:
        SystemExit: with status 1 after the error has been written to
            stderr, when the options are invalid or reading, sampling or
            writing fails.
    """
    options = get_options()

    try:
        step = int(options.step)
        sample_len = int(options.sampleLen)
        if step <= 0:
            # step is used as a divisor for the coefficient.
            raise ValueError('--step must be a positive integer, got %d' % step)

        urls_by_host = _read_urls_by_host(options.inputData)
        sample = _build_sample(urls_by_host, step, sample_len)
        _write_sample(options.outputData, sample)
    except Exception as error:
        # Top-level boundary: report on stderr and signal failure through the
        # exit status instead of finishing as if the run had succeeded.
        sys.stderr.write('%s: %s\n' % (type(error).__name__, error))
        sys.exit(1)


# Script entry point: run the sampler only when executed directly, not on import.
if __name__ == "__main__":
    main()
