#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import json
import sys
import urllib2
import base64
from urlparse import urlparse
import yt.wrapper as yt

def HandleOptions():
    """Build and return the command-line parser for this tool.

    Options:
        --server     mapreduce server, defaults to "hahn.yt.yandex.net:80".
        -f / --file  file with queries (required).
    """
    option_specs = (
        (("--server",), {
            "dest": "server",
            "help": "mapreduce server",
            "default": "hahn.yt.yandex.net:80",
            "required": False,
        }),
        (("-f", "--file"), {
            "dest": "file",
            "help": "File with queries",
            "required": True,
        }),
    )

    cli = argparse.ArgumentParser(
        description="Tool for collecting data from uppersearch and middlesearch",
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli


def match_hosts(rec):
    """Yield one ``{"host": ..., "class": ...}`` row for the record *rec*.

    *rec* must provide ``rec["host"]`` and ``rec["suggestions"]``; the latter
    is a string holding a mapping of class name to score, written with single
    quotes, which are swapped for double quotes before JSON parsing.

    The chosen class is the one with the highest score strictly greater than
    zero; the "unknown" key is never considered.  When no class qualifies the
    result class is "unknown".

    Written as a generator, presumably so it can serve as a YT mapper (see the
    disabled ``yt.run_map`` call in ``main``) -- TODO confirm.
    """
    host = rec["host"]
    # NOTE(review): plain quote replacement breaks on values that themselves
    # contain quote characters; assumes suggestions are simple name->number maps.
    suggestions = json.loads(rec["suggestions"].replace("'", "\""))

    hostclass = "unknown"
    best_score = 0
    for name, score in suggestions.items():
        if name == "unknown":
            continue
        if score > best_score:
            hostclass = name
            best_score = score
    yield {"host": host, "class": hostclass}

def _load_sport_hosts(path, hosts):
    """Add every non-blank, stripped line of the file at *path* to the set *hosts*."""
    with open(path, "r") as hostf:
        for line in hostf:
            host = line.strip()
            # A blank line would put "" into the set and make documents with
            # an empty host count as sport hosts.
            if host:
                hosts.add(host)


def _fetch_search_dump(query):
    """Send the search request for *query* to hamster.yandex.ru and return the raw body.

    *query* is a unicode string; it is UTF-8 encoded and URL-quoted before
    being put into the request.  The HTTP response object is always closed,
    even when reading fails.  Any error raised by ``urllib2`` propagates to
    the caller.
    """
    url = "https://hamster.yandex.ru/search?text=" + urllib2.quote(query.encode('utf-8')) + "&noredirect=1&nocache=da&lr=213&json_dump=searchdata.docs&no-tests=1&json_dump=wizplaces.carousel"
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _sport_signals(result, hoststat):
    """Return ``(sporthost, wizard)`` for one parsed search dump *result* (a dict).

    sporthost -- number of entries in ``result["searchdata.docs"]`` whose
                 "host" (with a leading "www." dropped) is in *hoststat*.
    wizard    -- True when any document host equals "SPORTWIZARD_URL", or any
                 ``result["wizplaces.carousel"]`` entry whose "counter_prefix"
                 contains "special/event" either has subtype "hockey" or
                 "football", or has "sport" in ``data.parent_collection.id``.

    Missing or empty "searchdata.docs" / "wizplaces.carousel" are treated as
    empty lists.
    """
    sporthost = 0
    livescore = False
    tournament = False
    tvsport = False

    for doc in result.get("searchdata.docs") or []:
        if "host" not in doc:
            continue
        host = doc["host"]
        if host.startswith("www."):
            host = host[4:]
        if host in hoststat:
            sporthost += 1
        if host == "SPORTWIZARD_URL":
            livescore = True

    for entry in result.get("wizplaces.carousel") or []:
        if "counter_prefix" not in entry or "special/event" not in entry["counter_prefix"]:
            continue
        if entry.get("subtype") in ("hockey", "football"):
            tournament = True
            continue
        try:
            # Sticky like the other flags: a later non-sport carousel entry
            # must not erase an earlier sport match.
            if "sport" in entry["data"]["parent_collection"]["id"]:
                tvsport = True
        except (KeyError, TypeError):
            # Entries without data.parent_collection.id are simply not
            # sport collections.
            pass

    return sporthost, (tvsport or livescore or tournament)


def main():
    """Read queries from the ``--file`` argument and print the sport-related ones.

    For every distinct query (first tab-separated column of each input line)
    the search dump is fetched and inspected.  When at least one sport signal
    is found, the original line, the sport-host count and the wizard flag are
    written to stdout, tab-separated.  After each processed query a progress
    line ``i j k`` goes to stderr, where ``i`` is the number of input lines
    read and ``k`` the number of failed fetches/parses.
    """
    parser = HandleOptions()
    args = parser.parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    hostmatches = "//home/search-research/hosts-markup"
    hoststat = set(["allboxing.ru", "allhockey.ru", "allsportinfo.ru", "biathlonrus.com", "championat.com", "euro-football.ru", "football.kulichki.net", "khl.ru",
                    "livesport.ru", "matchtv.ru", "news.sportbox.ru", "rsport.ria.ru", "rusfootball.info", "soccernews.ru", "soccer.ru", "sovsport.ru", "sport.business-gazeta.ru",
                    "sport-express.ru", "sportfm.ru", "sport.ru", "sportsdaily.ru", "sports.ru", "stadium.ru", "vseprosport.ru"
                    ])
    # Disabled alternative: extend hoststat from the YT markup table.
    # with yt.TempTable(prefix = "quickhosts") as table:
        # yt.run_map(match_hosts,
                   # source_table = [hostmatches],
                   # destination_table = table,
            # )
        # for row in yt.read_table(table):
            # if row["class"] == "sport":
                # hoststat.add(row["host"])
    _load_sport_hosts("sporthost.txt", hoststat)

    i = 0  # input lines read
    # NOTE(review): j is never incremented anywhere in this function; it is
    # kept only so the stderr progress line keeps its "i j k" format.
    j = 0
    k = 0  # queries that failed to fetch or parse
    seen_queries = set()  # set membership keeps de-duplication O(1) per line

    with open(args.file, "r") as f:
        for line in f:
            i += 1
            query = line.split('\t')[0].strip().decode('utf-8')
            if query in seen_queries:
                continue
            seen_queries.add(query)

            try:
                response = _fetch_search_dump(query)
            except Exception:
                # Best effort: any network/HTTP failure skips the query, but
                # KeyboardInterrupt/SystemExit still stop the run.
                sys.stderr.write('connection error\n')
                k += 1
                continue

            try:
                result = json.loads(response)
            except ValueError:
                result = None
            if not isinstance(result, dict):
                # Unparsable or unexpectedly shaped body: dump it for inspection.
                sys.stderr.write(response + "\n")
                k += 1
                continue

            sporthost, wizard = _sport_signals(result, hoststat)
            if sporthost or wizard:
                sys.stdout.write("\t".join([line.strip(), str(sporthost), str(wizard)]) + "\n")

            sys.stderr.write("%d %d %d\n" % (i, j, k))


# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
