import json
import urllib
import requests
from bs4 import BeautifulSoup
import codecs
import argparse
import logging
from collections import defaultdict
from serpparser.tagger import SerpTagger
from serpparser.serp_parser_common import SerpMetadata
import multiprocessing
from itertools import repeat, izip

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def get_tags(tagger, url, meta):
    """Download a SERP page and return the tags the tagger finds in it.

    :param tagger: object exposing ``parse_serp_tags(soup, meta)``.
    :param url: address of the saved SERP HTML page.
    :param meta: metadata object passed through to the tagger unchanged.
    :return: whatever ``tagger.parse_serp_tags`` returns for the page.
    """
    response = urllib.urlopen(url)
    try:
        raw_html = response.read()
    finally:
        # Close explicitly so the connection is released even if read() fails,
        # instead of relying on garbage collection.
        response.close()
    soup = BeautifulSoup(raw_html, "html.parser")
    return tagger.parse_serp_tags(soup, meta)


def argument_parser():
    """Build the command-line interface and return the parsed arguments.

    Required options are ``-ui`` and ``-t``; the remaining options fall back
    to the defaults listed below.
    """
    cli = argparse.ArgumentParser(description='Get parameters')

    # Mandatory options: (flag, destination attribute, help text).
    required_options = (
        ("-ui", "ui", "touch or desktop"),
        ("-t", "ticket_file", "filepath to ticket numbers"),
    )
    for flag, dest, help_text in required_options:
        cli.add_argument(flag, dest=dest, help=help_text, required=True)

    # Optional string options: (flag, destination attribute, default).
    string_options = (
        ("--y_sys_id", "yandex_sys_id", "0"),
        ("--g_sys_id", "google_sys_id", "1"),
        ("--out_prefix", "out_prefix", ""),
    )
    for flag, dest, default in string_options:
        cli.add_argument(flag, dest=dest, default=default)

    # Optional integer options: (flag, destination attribute, default).
    integer_options = (
        ("--max_pos", "max_position", 6),
        ("--min_count", "min_count", 1),
    )
    for flag, dest, default in integer_options:
        cli.add_argument(flag, dest=dest, default=default, type=int)

    return cli.parse_args()


# Snippet-image tags are only counted below this value, regardless of the
# caller-supplied max_position.
_SNIPPET_IMAGE_MAX_POSITION = 3


def _collect_system_tags(results, tagger, query, system_id, meta, system_name, max_position):
    """Record the query text under every tag one system's SERP yields.

    Best-effort: any failure (missing system entry, download or parsing error)
    is logged and the query is skipped for this system. Tags added before the
    failure are kept.

    :param results: mapping of tag name -> set of query texts, updated in place.
    :param tagger: tagger passed to ``get_tags``.
    :param query: one entry of ``data['results']['queries']``.
    :param system_id: key of the system inside ``query['systems']``.
    :param meta: metadata object for this system, passed to ``get_tags``.
    :param system_name: ``"yandex"`` or ``"google"``; used to build the
        snippet-image tag name and the log message.
    :param max_position: exclusive upper bound for every tag value except the
        snippet-image tag.
    """
    text = query['query']['text']
    snippet_image_tag = "{}_snippet_image".format(system_name)
    try:
        html_url = query['systems'][system_id]['html_url']
        tags = get_tags(tagger, html_url, meta)
        for tag, position in tags.iteritems():
            if tag == snippet_image_tag:
                limit = _SNIPPET_IMAGE_MAX_POSITION
            else:
                limit = max_position
            if position < limit:
                results[tag].add(text)
    except Exception:
        # Lazy %-style arguments let the logging module do the formatting,
        # rather than building the message with str.format up front.
        logger.exception("Can't process query for %s: %s", system_name, text)


def proccess_tickets(ticket, ui, yandex_sys, google_sys, max_position):
    """Fetch one experiment ticket and group its queries by detected SERP tag.

    :param ticket: experiment identifier inserted into the API URL.
    :param ui: UI kind passed to ``SerpMetadata`` for both systems.
    :param yandex_sys: key of the Yandex system inside each query's ``systems``.
    :param google_sys: key of the Google system inside each query's ``systems``.
    :param max_position: exclusive upper bound on a tag's value for the query
        to be counted (snippet-image tags use a fixed bound instead).
    :return: ``defaultdict(set)`` mapping tag name -> set of query texts, plus
        the key ``"all_queries"`` holding every query text of the ticket.
    """
    # NOTE(review): verify=False disables TLS certificate verification for
    # this request. Confirm this is intentional for the internal host.
    r = requests.get("https://sbs.yandex-team.ru/api/experiment/{}".format(ticket),
                     headers={"Content-Type": "application/json"},
                     verify=False
                     )
    data = json.loads(r.content)

    yandex_meta = SerpMetadata("yandex", ui)
    google_meta = SerpMetadata("google", ui)
    tagger = SerpTagger()

    results = defaultdict(set)
    for query in data['results']['queries']:
        results["all_queries"].add(query['query']['text'])
        _collect_system_tags(results, tagger, query, yandex_sys, yandex_meta,
                             "yandex", max_position)
        _collect_system_tags(results, tagger, query, google_sys, google_meta,
                             "google", max_position)

    return results


def one_process_star(all_args):
    """Call ``proccess_tickets`` with its arguments packed into one iterable.

    ``Pool.imap`` hands each worker a single object, so the positional
    arguments travel as one tuple and are unpacked here.
    """
    packed_args = tuple(all_args)
    return proccess_tickets(*packed_args)


def save_to_file(filename, queries):
    """Write each query on its own line to ``filename`` as UTF-8.

    The file is created or truncated; queries are written in iteration order.
    """
    with codecs.open(filename, "w", encoding="utf-8") as sink:
        for query in queries:
            line = query + u"\n"
            sink.write(line)


def add_set_to_dict(d, s):
    """Increment the counter stored in ``d`` for every element of ``s``.

    ``d`` is modified in place. It must already hold (or be able to supply a
    default for) each element, e.g. a ``defaultdict(int)``.
    """
    for item in s:
        d[item] = d[item] + 1


def get_set(d, min_count):
    """Return the set of keys of ``d`` whose count is at least ``min_count``."""
    selected = set()
    for query, count in d.iteritems():
        if count >= min_count:
            selected.add(query)
    return selected


def main():
    """Tag the SERPs of every ticket and write per-wizard query lists.

    Reads ticket ids from the file given with ``-t``, processes them in a pool
    of three worker processes, and counts for every tag in how many tickets
    each query received it. A query belongs to a tag's set when that count is
    at least ``--min_count``. The following files are written under
    ``results/`` (the directory must already exist):

    * ``<prefix>_any_<wizard>``, ``<prefix>_both_<wizard>``,
      ``<prefix>_only_google_<wizard>``, ``<prefix>_only_yandex_<wizard>``
      for wizards both systems share;
    * ``<prefix>_only_<tag>`` for system-specific wizards;
    * ``<prefix>_snippets`` with the queries that triggered no wizard
      (``misspell`` is not counted as a wizard here).
    """
    args = argument_parser()
    ui = args.ui

    yandex_sys = args.yandex_sys_id
    google_sys = args.google_sys_id

    wizards = ["fact", "geo", "video", "entity", "images", "bno", "misspell", "snippet_image"]
    only_yandex_wizards = ["yandex_collections", "yandex_market", "yandex_chats"]
    only_google_wizards = ["google_recommend"]
    y_wizards = ["yandex_{}".format(wizard) for wizard in wizards]
    g_wizards = ["google_{}".format(wizard) for wizard in wizards]

    with open(args.ticket_file, "r") as f:
        # Skip blank lines so they do not turn into requests for an empty ticket id.
        tickets = [line.strip() for line in f if line.strip()]

    # bucket[tag][query] -> number of tickets in which the query received the tag.
    total_buckets = defaultdict(lambda: defaultdict(int))
    only_yandex_bucket = defaultdict(lambda: defaultdict(int))
    only_google_bucket = defaultdict(lambda: defaultdict(int))
    all_queries = set()

    pool = multiprocessing.Pool(3)
    try:
        for result in pool.imap(one_process_star, izip(tickets,
                                                       repeat(ui),
                                                       repeat(yandex_sys),
                                                       repeat(google_sys),
                                                       repeat(args.max_position))):
            # Accumulate queries across tickets; plain assignment would keep
            # only the queries of the last ticket.
            all_queries |= result["all_queries"]
            for y_wizard, g_wizard in zip(y_wizards, g_wizards):
                add_set_to_dict(total_buckets[y_wizard], result[y_wizard])
                add_set_to_dict(total_buckets[g_wizard], result[g_wizard])
            for y_wizard in only_yandex_wizards:
                add_set_to_dict(only_yandex_bucket[y_wizard], result[y_wizard])
            for g_wizard in only_google_wizards:
                add_set_to_dict(only_google_bucket[g_wizard], result[g_wizard])
    finally:
        # Always shut the workers down. After a complete run every task has
        # finished; after an error there is no point waiting for the rest.
        pool.terminate()
        pool.join()

    # Build the sets from the known tag names rather than from the buckets'
    # keys, so that a tag with no data yields an empty set, not a KeyError.
    wizard_sets = {tag: get_set(total_buckets[tag], args.min_count)
                   for tag in y_wizards + g_wizards}
    only_yandex_sets = {tag: get_set(only_yandex_bucket[tag], args.min_count)
                        for tag in only_yandex_wizards}
    only_google_sets = {tag: get_set(only_google_bucket[tag], args.min_count)
                        for tag in only_google_wizards}

    snippets_queries = all_queries
    for wizard, y_wizard, g_wizard in zip(wizards, y_wizards, g_wizards):
        yandex_set = wizard_sets[y_wizard]
        google_set = wizard_sets[g_wizard]
        save_to_file("results/{}_any_{}".format(args.out_prefix, wizard),
                     yandex_set | google_set)
        save_to_file("results/{}_both_{}".format(args.out_prefix, wizard),
                     yandex_set & google_set)
        save_to_file("results/{}_only_google_{}".format(args.out_prefix, wizard),
                     google_set - yandex_set)
        save_to_file("results/{}_only_yandex_{}".format(args.out_prefix, wizard),
                     yandex_set - google_set)
        if wizard != "misspell":
            snippets_queries = snippets_queries - (yandex_set | google_set)

    # Write the min_count-filtered sets, matching what is subtracted from
    # snippets_queries, rather than every key of the raw count dicts.
    for y_wizard in only_yandex_wizards:
        save_to_file("results/{}_only_{}".format(args.out_prefix, y_wizard), only_yandex_sets[y_wizard])
        snippets_queries = snippets_queries - only_yandex_sets[y_wizard]
    for g_wizard in only_google_wizards:
        save_to_file("results/{}_only_{}".format(args.out_prefix, g_wizard), only_google_sets[g_wizard])
        snippets_queries = snippets_queries - only_google_sets[g_wizard]

    save_to_file("results/{}_{}".format(args.out_prefix, "snippets"), snippets_queries)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

