#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import json
import sys
import urllib2
import base64
from urlparse import urlparse
import yt.wrapper as yt
from collections import defaultdict


# Sport label -> set of lowercase substrings (word stems, including common
# misspellings) that count as a mention of that sport.
sportlist = {
    u"футбол": set([u"футбол"]),
    u"хокке": set([u"хокке", u"кхл"]),
    u"биатлон": set([u"биатлон"]),
    u"волейбол": set([u"волейбол", u"воллейбол"]),
    u"баскетбол": set([u"баскетбол"]),
    u"бокс": set([u"бокс", u"борец", u"бой", u"мма"]),
    u"фигурн": set([u"фигурн", u"фигурист"]),
    u"теннис": set([u"теннис", u"тенис"]),
    u"лыжи": set([u"лыжн", u"трамплин"]),
    u"формула 1": set([u"формула 1", u"формула один"]),
}

def HandleOptions():
    """Build the command-line parser for this script.

    Returns the configured ``argparse.ArgumentParser``; the caller is
    responsible for invoking ``parse_args`` on it.
    """
    cli = argparse.ArgumentParser(
        description="Tool for collecting data from uppersearch and middlesearch",
    )
    # Optional: which cluster proxy to talk to.
    cli.add_argument(
        "--server",
        dest="server",
        default="hahn.yt.yandex.net:80",
        required=False,
        help="mapreduce server",
    )
    # Mandatory: path to the input file with queries.
    cli.add_argument(
        "-f",
        "--file",
        dest="file",
        required=True,
        help="File with queries",
    )
    return cli


def match_hosts(rec):
    """Label a host with its highest-scoring suggested class.

    Args:
        rec: mapping with a ``"host"`` value and a ``"suggestions"`` string
            holding a single-quoted dict of ``class -> score``.

    Yields:
        One ``{"host": ..., "class": ...}`` dict. The class is the first key
        (in dict iteration order) with the strictly largest positive score;
        the ``"unknown"`` key is ignored as a candidate, and ``"unknown"`` is
        also the result when no candidate scores above zero.
    """
    host = rec["host"]
    # The suggestions string uses single quotes; swap them for double quotes
    # so that it parses as JSON.
    suggestions = json.loads(rec["suggestions"].replace("'", "\""))
    hostclass = "unknown"
    best_score = 0
    for name, score in suggestions.items():
        if name == "unknown":
            continue
        if score > best_score:
            hostclass = name
            best_score = score
    yield {"host": host, "class": hostclass}

def _load_sport_hosts(path, hosts):
    """Add every whitespace-stripped line of the file at *path* to *hosts*."""
    with open(path, "r") as hostf:
        for line in hostf:
            hosts.add(line.strip())


def _as_text(value):
    """Turn a title/passages value from the JSON dump into lowercase unicode.

    Lists are joined element-wise: ``unicode()`` of a list yields its repr,
    in which non-ASCII characters are ``\\uXXXX``-escaped and therefore can
    never match the Cyrillic keywords.
    NOTE(review): assumes "passages" may be a list of strings -- confirm
    against the actual search response.
    """
    if value is None:
        return u""
    if isinstance(value, (list, tuple)):
        text = u" ".join(unicode(part) for part in value)
    else:
        text = unicode(value)
    # The literal must be a unicode literal: in a Python 2 byte string
    # "\u0007" is six plain characters, not the BEL character.
    return text.replace(u"\u0007", u"").lower()


def _count_sport_mentions(docs):
    """Count keyword hits per sport over the titles and snippets of *docs*."""
    counts = defaultdict(int)
    for doc in docs:
        title = _as_text(doc.get("doctitle"))
        snippet = u""
        snippets = doc.get("snippets") or {}
        if "main" in snippets:
            snippet = _as_text(snippets["main"].get("passages"))
        for sportw, stems in sportlist.items():
            for stem in stems:
                if stem in title:
                    counts[sportw] += 1
                if stem in snippet:
                    counts[sportw] += 1
    return counts


def _pick_sport_class(counts):
    """Return the output label for *counts* as a byte string.

    "None" when nothing matched, "Misc" when several sports share the top
    count, otherwise the UTF-8 encoded name of the single top sport.
    """
    if not counts:
        return "None"
    top = max(counts.values())
    leaders = [name for name in counts if counts[name] == top]
    # Only a tie for first place makes the query ambiguous; ties among
    # lower-ranked sports must not turn a clear winner into "Misc".
    if len(leaders) > 1:
        return "Misc"
    return leaders[0].encode('utf-8')


def main():
    """Classify each query of the input file by sport.

    Reads ``query<TAB>weight`` lines from the file given with ``--file``,
    fetches the search results for every query, counts sport keyword hits
    in result titles and snippets, and writes
    ``query<TAB>weight<TAB>class`` to stdout. After each processed query a
    progress line ``i j k`` goes to stderr (``i``: lines read, ``k``: failed
    or unparsable responses).
    """
    parser = HandleOptions()
    args = parser.parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    hoststat = set(["allboxing.ru", "allhockey.ru", "allsportinfo.ru", "biathlonrus.com", "championat.com", "euro-football.ru", "football.kulichki.net", "khl.ru",
                    "livesport.ru", "matchtv.ru", "news.sportbox.ru", "rsport.ria.ru", "rusfootball.info", "soccernews.ru", "soccer.ru", "sovsport.ru", "sport.business-gazeta.ru",
                    "sport-express.ru", "sportfm.ru", "sport.ru", "sportsdaily.ru", "sports.ru", "stadium.ru", "vseprosport.ru"
                    ])
    # The YT-based refresh of this set (match_hosts mapped over
    # //home/search-research/hosts-markup) is disabled; extra hosts are read
    # from sporthost.txt instead.
    # NOTE(review): hoststat is not consulted by the classification below.
    _load_sport_hosts("sporthost.txt", hoststat)

    with open(args.file, "r") as f:
        i = 0  # input lines read
        j = 0  # never incremented; kept so the progress line keeps three columns
        k = 0  # requests that failed or returned unparsable JSON
        for line in f:
            i += 1
            tmp = line.split('\t')
            if len(tmp) < 2:
                # Skip blank or tab-less lines instead of dying on tmp[1].
                sys.stderr.write('malformed input line %d, skipped\n' % i)
                continue
            query = tmp[0].strip().decode('utf-8')
            w = tmp[1].strip()

            url = "https://hamster.yandex.ru/search?text=" + urllib2.quote(query.encode('utf-8')) + "&noredirect=1&nocache=da&lr=213&json_dump=searchdata.docs&no-tests=1&json_dump=wizplaces.carousel"
            try:
                conn = urllib2.urlopen(url)
                try:
                    response = conn.read()
                finally:
                    conn.close()
            except Exception:
                # Best effort: any fetch failure skips the query, but
                # KeyboardInterrupt/SystemExit still stop the run.
                sys.stderr.write('connection error\n')
                k += 1
                continue
            try:
                result = json.loads(response)
            except ValueError:
                sys.stderr.write(response + "\n")
                k += 1
                continue
            # A missing or null document list is treated as "no documents",
            # which yields the class "None".
            docs = result.get('searchdata.docs') or []
            label = _pick_sport_class(_count_sport_mentions(docs))
            sys.stdout.write(query.encode('utf-8') + "\t" + w + "\t" + label + "\n")
            sys.stderr.write("%d %d %d\n" % (i, j, k))


# Run the classification only when executed as a script, not on import.
if __name__ == "__main__":
    main()
