#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

import sys
import argparse
import random
import urllib2
import json
import random

def HandleOption():
    """Build the command-line parser for the query filter script.

    All option values are kept as strings; callers convert ``count`` and
    ``threads`` to integers themselves.

    Returns:
        argparse.ArgumentParser: parser exposing ``input``, ``output``,
        ``count``, ``device`` and ``threads`` attributes after parsing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--in", dest="input", required=True,
                        help="input queries")
    parser.add_argument("--out", dest="output", required=True,
                        help="filtered queries")
    # The help text used to be a copy of the --in description.
    parser.add_argument("--count", dest="count", required=True,
                        help="number of queries to select")
    parser.add_argument("--device", dest="device", default="touch",
                        help="touch or desktop")
    parser.add_argument("--threads", dest="threads", default="0",
                        help="Take less queries for parallel runs")
    return parser


def _build_serp_url(query, device):
    """Return the hamster search URL requesting a JSON dump for *query*.

    ``device == "touch"`` selects the touch endpoint; any other value
    selects the desktop one. *query* is a unicode string.
    """
    path = "/search/touch" if device == "touch" else "/search"
    return ("https://hamster.yandex.ru" + path + "?text="
            + urllib2.quote(query.encode('utf-8'))
            + "&noredirect=1&nocache=da&no-tests=1"
            + "&json_dump=searchdata.docs&json_dump=search_props")


def _has_news_wizard(docs):
    """Return True if any document has a marker containing "NEWS_WIZARD"."""
    for doc in docs:
        for marker in doc.get("_markers") or ():
            if "NEWS_WIZARD" in marker:
                return True
    return False


def main():
    """Select random queries whose search results contain a news wizard.

    Reads tab-separated ``query<TAB>number`` lines from ``--in``, shuffles
    them, and requests the search results for each query until the quota
    derived from ``--count`` / ``--threads`` is reached or the list is
    exhausted. Every accepted query is written to ``--out`` as a
    tab-separated line with fixed trailing columns that depend on
    ``--device``. Progress counters are printed to stderr.
    """
    args = HandleOption().parse_args()
    full_list = []

    with open(args.input, "r") as source:
        for line in source:
            fields = line.strip().split('\t')
            if len(fields) > 1:
                full_list.append((fields[0].decode('utf-8'), float(fields[1])))
                # Zero-based index of the record just loaded.
                print >> sys.stderr, len(full_list) - 1

    random.shuffle(full_list)
    threads = int(args.threads)
    if threads:
        # True division (from __future__): the quota may be fractional.
        quota = int(args.count) / threads + int(args.count) / 50
    else:
        quota = int(args.count)

    queries = {}
    errors = 0  # network and JSON decoding failures
    # Each candidate is visited exactly once. The previous hand-maintained
    # index advanced twice after an accepted query and skipped the next one.
    for looked, (query, ip) in enumerate(full_list):
        if len(queries) >= quota:
            break
        print >> sys.stderr, len(queries), looked, errors
        if query in queries:
            continue

        url = _build_serp_url(query, args.device)
        try:
            connection = urllib2.urlopen(url)
            try:
                response = connection.read()
            finally:
                connection.close()
        except Exception:
            # Best effort: count the failure and move on. Unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit stop the run.
            print >> sys.stderr, 'connection error'
            errors += 1
            continue

        try:
            result = json.loads(response)
        except ValueError:
            print >> sys.stderr, response
            errors += 1
            continue

        # A reply without the docs dump is treated like an empty result
        # list instead of aborting the whole run with KeyError.
        docs = result.get('searchdata.docs') if isinstance(result, dict) else None
        if docs and _has_news_wizard(docs):
            queries[query] = ip

    label = 'news' if args.device == "touch" else 'news_desktop'
    with open(args.output, "w") as out:
        for query in queries:
            out.write(query.encode('utf-8') + '\t225\t' + label
                      + '\t[class*=t-construct-adapter__news]\n')

# Run the filter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
