#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import argparse
import datetime
import time
import numpy
import yt.wrapper as yt
import json
from collections import defaultdict

def HandleOption():
    """Build the command-line parser for this script.

    Registers two options:
      --base    required; stored as ``base``.
      --server  optional; stored as ``server``, defaults to
                "hahn.yt.yandex.net:80".

    Returns:
        The configured ``argparse.ArgumentParser`` (arguments are not parsed
        here; the caller invokes ``parse_args``).
    """
    option_specs = (
        ("--base", {
            "dest": "base",
            "help": "input with shiny database",
            "required": True,
        }),
        ("--server", {
            "dest": "server",
            "help": "mapreduce server",
            "default": "hahn.yt.yandex.net:80",
            "required": False,
        }),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli

def normalize(text):
    """Canonicalize a query string so equivalent queries compare equal.

    Every character that is neither alphanumeric nor whitespace is replaced
    by a space, runs of whitespace are collapsed to a single space, leading
    and trailing whitespace is dropped, and the result is lower-cased.

    Args:
        text: the query, either as UTF-8 encoded bytes or as already-decoded
            (unicode) text.

    Returns:
        The normalized query encoded as UTF-8 bytes.

    Raises:
        UnicodeDecodeError: if ``text`` is bytes that are not valid UTF-8.
    """
    # Decode only raw bytes: calling .decode() on already-decoded text fails
    # (implicit ASCII round-trip on Python 2, no such method on Python 3).
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    cleaned = ''.join(
        ch if ch.isalnum() or ch.isspace() else ' ' for ch in text
    )
    collapsed = ' '.join(cleaned.split())
    return collapsed.lower().encode('utf-8')

def _load_base(path):
    """Read the tab-separated base file into a {normalized query: url} dict.

    The second column of each line is taken as the URL and the fourth as the
    query. Blank lines are skipped silently; lines with fewer than four
    columns are reported on stderr and skipped. When several lines normalize
    to the same query, the last one wins.
    """
    base = {}
    with open(path) as basefile:
        for lineno, line in enumerate(basefile, 1):
            # Strip only the line terminator: a plain strip() would also eat
            # leading/trailing tabs and shift the column positions.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 4:
                if line.strip():
                    sys.stderr.write(
                        "skipping malformed base line %s\n" % lineno)
                continue
            base[normalize(fields[3])] = fields[1]
    return base


def _collect_urls(table, label, base, users):
    """Append the base URL matched by each row of ``table`` to its uid's list.

    Every uid seen gets an entry in ``users`` (possibly an empty list); a URL
    is appended only when the row's normalized query maps to a non-empty URL
    in ``base``. ``label`` only names the table in the progress messages.
    """
    for i, row in enumerate(yt.read_table(table)):
        # Periodic progress report on stderr.
        if i % 100000 == 241:
            sys.stderr.write("%s column line %s\n" % (label, i))
        matched = users.setdefault(row["uid"], [])
        url = base.get(normalize(row["query"]))
        if url:
            matched.append(url)


def main():
    """Report how diverse the base URLs matched by each user's queries are.

    Loads the base file given by --base, reads the two hard-coded YT tables
    (rows must provide "uid" and "query"), and for every user with at least
    one matched query computes ``distinct matched URLs / matched URLs``.
    The average and median of these ratios are printed to stdout twice:
    first labelled, then as bare numbers.
    """
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    table_right = "//tmp/itajn/1d760e6e-a710cceb-c6f46431-2bb0a2aa"
    table_left = "//tmp/itajn/46f64f59-d44014d4-f79ea5e5-49bab6b3"

    base = _load_base(args.base)

    users = {}
    _collect_urls(table_right, "right", base, users)
    _collect_urls(table_left, "left", base, users)

    # True division: the file enables ``from __future__ import division``.
    res = [len(set(urls)) / len(urls) for urls in users.values() if urls]

    average = numpy.average(res)
    median = numpy.median(res)
    sys.stdout.write("average %s\n" % average)
    sys.stdout.write("median %s\n" % median)
    sys.stdout.write("%s\n" % average)
    sys.stdout.write("%s\n" % median)

# Run the analysis only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == "__main__":
    main()
