#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import argparse
import random
import urllib2
import json

def HandleOption():
    """Build the command-line parser for this script.

    All three options are required:

    * ``--in``    -- path of the input query file (stored as ``input``).
    * ``--out``   -- path of the file the filtered queries are written to
      (stored as ``output``).
    * ``--count`` -- how many queries to keep (stored as ``count``). The value
      is left as a string; the caller converts it.

    Returns:
        argparse.ArgumentParser: the configured, not yet invoked, parser.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--in", dest="input", required=True,
                        help="input queries")
    parser.add_argument("--out", dest="output", required=True,
                        help="filtered queries")
    # Describe the quota itself rather than repeating the --in description.
    parser.add_argument("--count", dest="count", required=True,
                        help="maximum number of queries to keep")
    return parser


# Exclusive lower bound applied both to the probability column of the input
# file and to the "Fresh.IntentProbability" value returned by the backend.
_MIN_PROBABILITY = 0.4

_SEARCH_URL_PREFIX = "https://hamster.yandex.ru/touchsearch?text="
_SEARCH_URL_SUFFIX = "&noredirect=1&nocache=da&no-tests=1&json_dump=searchdata.docs&json_dump=search_props"


def _read_candidates(path):
    """Read ``(query, probability)`` pairs from a tab-separated file.

    Lines with fewer than two tab-separated fields are ignored. Lines whose
    first field is not valid UTF-8 or whose second field is not a number are
    reported on stderr and skipped instead of aborting the whole run.

    Returns:
        list: ``(unicode query, float probability)`` tuples in file order.
    """
    candidates = []
    with open(path, "r") as source:
        for line in source:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue
            try:
                # UnicodeDecodeError is a ValueError subclass, so one handler
                # covers both a bad encoding and a non-numeric probability.
                query = fields[0].decode('utf-8')
                probability = float(fields[1])
            except ValueError:
                print >> sys.stderr, 'skipping malformed line:', repr(line)
                continue
            # Progress output: index of the pair about to be stored.
            print >> sys.stderr, len(candidates)
            candidates.append((query, probability))
    return candidates


def _fetch_search_result(query):
    """Request the search dump for ``query`` and return the parsed JSON.

    Returns ``None`` (after logging to stderr) when the request fails or the
    body is not valid JSON.
    """
    url = _SEARCH_URL_PREFIX + urllib2.quote(query.encode('utf-8')) + _SEARCH_URL_SUFFIX
    try:
        connection = urllib2.urlopen(url)
        try:
            body = connection.read()
        finally:
            connection.close()
    except Exception:
        # Best-effort fetch: any request failure just skips this query.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # propagate so the run can still be interrupted.
        print >> sys.stderr, 'connection error'
        return None
    try:
        return json.loads(body)
    except ValueError:
        print >> sys.stderr, body
        return None


def _passes_backend_check(result):
    """Tell whether a parsed search response qualifies the query.

    The response qualifies when it lists at least one document and its first
    "UPPER" search-props entry carries a "Fresh.IntentProbability" greater
    than ``_MIN_PROBABILITY``.

    Raises:
        KeyError, IndexError, TypeError, ValueError: if the response does not
        have the expected structure.
    """
    if not result['searchdata.docs']:
        return False
    properties = result["search_props"]["UPPER"][0]["properties"]
    if "Fresh.IntentProbability" not in properties:
        return False
    return float(properties["Fresh.IntentProbability"]) > _MIN_PROBABILITY


def main():
    """Sample queries from the input file and keep those the backend confirms.

    Reads ``(query, probability)`` pairs from ``--in``, shuffles them, and
    walks the shuffled list until ``--count`` distinct queries are accepted
    or the list is exhausted. A query is accepted when its input probability
    exceeds ``_MIN_PROBABILITY`` and the search response passes
    ``_passes_backend_check``. Accepted queries are written to ``--out``, one
    per line, UTF-8 encoded and followed by a tab and ``213``.

    Failed requests, unparsable bodies and responses with an unexpected
    structure are counted as errors and skipped.
    """
    args = HandleOption().parse_args()

    candidates = _read_candidates(args.input)
    random.shuffle(candidates)
    quota = int(args.count)

    queries = {}
    errors = 0
    for position, (query, probability) in enumerate(candidates):
        if len(queries) >= quota:
            break
        # Progress output: accepted so far, candidate index, error count.
        print >> sys.stderr, len(queries), position, errors

        # Written as ``not (x > limit)`` so that a NaN probability is rejected.
        if not probability > _MIN_PROBABILITY or query in queries:
            continue

        result = _fetch_search_result(query)
        if result is None:
            errors += 1
            continue

        try:
            accepted = _passes_backend_check(result)
        except (KeyError, IndexError, TypeError, ValueError):
            # One oddly shaped response must not discard everything
            # collected so far; output is only written after the loop.
            print >> sys.stderr, 'unexpected response format'
            errors += 1
            continue

        if accepted:
            queries[query] = probability

    with open(args.output, "w") as out:
        for query in queries:
            # NOTE(review): 213 is presumably a region id -- confirm.
            out.write(query.encode('utf-8') + '\t' + '213\n')

# Script entry point: run the filter only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()
