#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import json
import sys
import urllib2
import base64
import random
import yt.wrapper as yt
import copy
import datetime
import time
import base64

def HandleOptions():
    """Build the command-line parser for this script.

    Options registered:
      -f / --file  -- stored as ``file``; help text "File with queries".
      --server     -- stored as ``server``; optional, defaults to
                      'hahn.yt.yandex.net:80'.

    Returns the configured ``argparse.ArgumentParser``; the caller is
    responsible for invoking ``parse_args()``.
    """
    # (flags, keyword settings) for every option, registered in this order.
    option_specs = (
        (("-f", "--file"), {
            "dest": "file",
            "help": "File with queries",
        }),
        (("--server",), {
            "dest": "server",
            "help": "mapreduce server",
            "default": 'hahn.yt.yandex.net:80',
            "required": False,
        }),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli

def _load_query_weights(path):
    """Count query occurrences in the file at *path*.

    Each line is split on tabs; the first field, stripped of surrounding
    whitespace, is the query.  Lines whose query is empty are ignored.

    Returns a dict mapping query (byte string) -> number of occurrences.
    """
    weights = {}
    with open(path, "r") as f:
        for line in f:
            query = line.split('\t')[0].strip()
            if not query:
                # Blank line: nothing to search for.
                continue
            weights[query] = weights.get(query, 0) + 1
    return weights


def _fetch_json(url, label=None):
    """Download *url* and parse the body as JSON.

    *label*, when given, is included in the diagnostics written to stderr
    so failures can be attributed to a particular request.

    Returns ``(True, parsed_value)`` on success and ``(False, None)`` when
    the request or the JSON parsing failed (the failure is reported on
    stderr).
    """
    try:
        conn = urllib2.urlopen(url)
        try:
            body = conn.read()
        finally:
            # Release the socket even if read() fails half-way.
            conn.close()
    except Exception:
        # Deliberately broad (URL, HTTP and socket errors are unrelated
        # classes), but unlike a bare "except:" this lets
        # KeyboardInterrupt / SystemExit stop the script.
        if label is None:
            print >> sys.stderr, 'connection error'
        else:
            print >> sys.stderr, 'connection error ' + label
        return False, None

    try:
        return True, json.loads(body)
    except ValueError:
        # Dump the raw body so the unparsable answer can be inspected.
        if label is not None:
            print >> sys.stderr, label
        print >> sys.stderr, body
        return False, None


def _news_wizard_position(docs):
    """Return the ``num`` of the document marked as NEWS_WIZARD, or None.

    A document counts as marked when any entry of its ``_markers``
    contains "NEWS_WIZARD".
    """
    position = None
    for doc in docs:
        if "_markers" not in doc:
            continue
        if any("NEWS_WIZARD" in marker for marker in doc["_markers"]):
            # NOTE(review): no early return -- if several documents are
            # marked, the last one wins, as in the original implementation.
            position = doc["num"]
    return position


def main():
    """Collect video search dumps for queries that trigger the news wizard.

    Reads queries from ``--file``, and for every distinct query (in random
    order) requests the touch SERP.  When a NEWS_WIZARD document is found
    with ``num`` below 5, every video request variant is issued for the
    query and one row is appended to the YT table with the query, its
    weight (occurrence count), a Unix timestamp, the decoded quick URLs and
    the JSON answer of each variant that succeeded.

    Queries whose requests fail or whose answers are malformed are
    reported on stderr and skipped.
    """
    parser = HandleOptions()
    args = parser.parse_args()
    if not args.file:
        # Fail with a usage message instead of a TypeError from open(None).
        parser.error('a query file is required (-f/--file)')
    yt.update_config({'proxy': {'url': args.server}})

    videoparams = { '1.plain_video' : "http://hamster.yandex.ru/video/search?no-tests=1&nocache=da&timeout=9999999&waitall=da&json_dump=searchdata.clips.*.VisibleURL",
                    '2.videoquick' : "http://hamster.yandex.ru/video/search?no-tests=1&nocache=da&timeout=9999999&waitall=da&json_dump=searchdata.clips.*.VisibleURL&srcask=VIDEOQUICK&srcask=VIDEOQUICK_MISSPELL",
                    '3.videoquick_news' : "http://hamster.yandex.ru/video/search?no-tests=1&nocache=da&timeout=9999999&waitall=da&json_dump=searchdata.clips.*.VisibleURL&srcask=VIDEOQUICK&srcask=VIDEOQUICK_MISSPELL&relev=boost_is_news=10",
                    '4.videoquick_news_24h' : "http://hamster.yandex.ru/video/search?no-tests=1&nocache=da&timeout=9999999&waitall=da&json_dump=searchdata.clips.*.VisibleURL&srcask=VIDEOQUICK&srcask=VIDEOQUICK_MISSPELL&relev=boost_is_news=10&relev=news_time_threshold=0.85&relev=news_time_value=10",
                    '5.videoquick_24h' : "http://hamster.yandex.ru/video/search?no-tests=1&nocache=da&timeout=9999999&waitall=da&json_dump=searchdata.clips.*.VisibleURL&srcask=VIDEOQUICK&srcask=VIDEOQUICK_MISSPELL&relev=news_time_threshold=0.85&relev=news_time_value=10",
                    '6.videoquick_touch_24h' : "http://hamster.yandex.ru/video/touch/search?no-tests=1&nocache=da&timeout=9999999&waitall=da&json_dump=searchdata.clips.*.VisibleURL&noredirect=1&relev=news_time_threshold=0.85&relev=news_time_value=10&srcask=VIDEOQUICK&srcask=VIDEOQUICK_MISSPELL",
                    '7.videoquick_touch_news_24h' : "http://hamster.yandex.ru/video/touch/search?no-tests=1&nocache=da&timeout=9999999&waitall=da&json_dump=searchdata.clips.*.VisibleURL&noredirect=1&relev=news_time_threshold=0.85&relev=news_time_value=10&relev=boost_is_news=10&srcask=VIDEOQUICK&srcask=VIDEOQUICK_MISSPELL"
                  }

    yt_path = '//home/freshness/staff/itajn/FR-2722/2017-06-21_requid'
    if not yt.exists(yt_path):
        yt.create_table(path = yt_path, recursive = True)

    weights = _load_query_weights(args.file)

    query_list = list(weights)
    random.shuffle(query_list)

    for query in query_list:
        try:
            # Validation only: quote() below works on the raw byte string.
            query.decode("utf-8")
        except UnicodeDecodeError:
            print >> sys.stderr, 'query is not valid utf-8, skipped'
            continue
        quoted_query = urllib2.quote(query)

        serp_url = "https://hamster.yandex.ru/touchsearch?text=" + quoted_query + "&noredirect=1&lr=213&json_dump=searchdata.docs&no-tests=1&&metahost2=QUICK%3Amcquack.search.yandex.net%3A8879&waitall=da&nocache=da&lr=213&json_dump=search_props.QUICK"
        ok, serp = _fetch_json(serp_url)
        if not ok:
            continue

        try:
            position = _news_wizard_position(serp['searchdata.docs'])
            # Only queries with the news wizard at num < 5 are of interest.
            if position is None or int(position) >= 5:
                continue
            encoded_urls = serp["search_props.QUICK"][0]["properties"]["NewsFromQuickMiddle.FetchedDocs"]
            quickurls = base64.b64decode(encoded_urls).split(' ')
        except (KeyError, IndexError, TypeError, ValueError):
            # One malformed answer must not abort the whole run.
            print >> sys.stderr, 'unexpected response format'
            continue

        # Seconds since the epoch, truncated to an integer.
        timestamp = int(time.time())

        record = {'query' : query, 'weight' : weights[query], 'timestamp' : timestamp, 'quick_urls' : json.dumps(quickurls)}
        for param in sorted(videoparams):
            video_url = videoparams[param] + '&text=' + quoted_query + '&json_dump=reqparam.reqid'
            ok, video_result = _fetch_json(video_url, param)
            if not ok:
                # A failed variant is simply absent from the row.
                continue
            record[param] = json.dumps(video_result)

        yt.write_table(yt.TablePath(yt_path, append = True), [record])

# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
