#!/usr/bin/env python

import os
import sys
import urllib2

import yt.wrapper
from yt.wrapper.client import Yt

def get_host_spamness(hostnames):
    """Ask the spam-prediction service for the spam probability of hosts.

    The host names are joined with ";" and sent as the request body to the
    predictSpamHost endpoint. The response body is parsed as
    whitespace-separated ``hostname=probability`` tokens.

    Args:
        hostnames: list of host name strings to score.

    Returns:
        dict mapping every hostname found in the response to its probability.
        The probability is kept as the raw string sent by the service, so
        callers convert it themselves. An empty ``hostnames`` yields an empty
        dict without contacting the service.

    Raises:
        urllib2.URLError: if the request to the service fails.
        ValueError: if a response token contains no "=" separator.
    """
    # Nothing to score: avoid a pointless request with an empty body.
    if not hostnames:
        return {}

    request = urllib2.Request(
        url="http://webmaster.dev.search.yandex.net:25050/predictSpamHost",
        data=";".join(hostnames))
    response = urllib2.urlopen(request)
    try:
        content = response.read()
    finally:
        # Always release the connection, even if reading the body fails.
        response.close()

    predicted = {}
    for token in content.split():
        # Split on the last "=" only, so a host name that itself contains
        # "=" does not break the unpacking.
        hostname, probability = token.rsplit("=", 1)
        predicted[hostname] = probability
    return predicted

# YT client for the banach.yt.yandex.net proxy; the token is taken from the
# YT_TOKEN environment variable.
client = Yt(proxy='banach.yt.yandex.net', token=os.getenv("YT_TOKEN"))

# Table that is read in full below.
input_table = "//home/webmaster/prod/export/archive/webmaster-verified-hosts/webmaster-verified-hosts.20170817"

# Maps user id (int) -> list of "host_url" values seen for that user,
# in the order the rows are read.
user_ids = {}

for record in client.read_table(
        input_table + "[:]",
        raw=False,
        format=yt.wrapper.DsvFormat()):
    owner = int(record["user_id"])
    host = record["host_url"]
    # Start a fresh list on the first row of each user, then append.
    user_ids.setdefault(owner, []).append(host)

# Print one tab-separated line per user: the user id, the mean spam
# probability of that user's hosts, and the ";"-joined list of the hosts.
for uid in user_ids:
    spamness = get_host_spamness(user_ids[uid])
    if not spamness:
        # No predictions came back for this user. Averaging would divide by
        # zero and abort the whole run, so report the user and move on.
        sys.stderr.write("no spam predictions for user %d, skipping\n" % uid)
        continue
    # The mean is taken over the hosts present in the response, which is
    # keyed by hostname and may therefore hold fewer entries than were sent.
    avg = sum(float(p) for p in spamness.values()) / len(spamness)
    # Parenthesised single-argument form prints the same text as the bare
    # print statement.
    print("%d\t%f\t%s" % (uid, avg, ";".join(user_ids[uid])))
