#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
sys.path.append("/home/terminutz/arcadia/quality/yaqlib")
from bs4 import BeautifulSoup
from urllib.request import urlopen
from serpparser.tagger import SerpTagger
from serpparser.serp_parser_common import SerpMetadata
from multiprocessing import Pool
import argparse
import codecs

from yql.api.v1.client import YqlClient



def get_urls(args):
    """Download one page and return the snippet URLs the tagger finds in it.

    Takes a single tuple argument so it can be passed directly to
    ``Pool.map``.

    Args:
        args: a ``(tagger, meta, url)`` tuple. ``tagger`` must provide
            ``get_snippet_urls(soup, meta)``; ``meta`` is handed to it
            unchanged; ``url`` is the address of the HTML page to fetch.

    Returns:
        Whatever ``tagger.get_snippet_urls`` returns for the parsed page.

    Raises:
        Any error raised while downloading or parsing the page propagates
        to the caller; nothing is caught here.
    """
    tagger, meta, url = args
    # Close the response deterministically so worker processes do not
    # accumulate open sockets across many URLs.
    with urlopen(url) as response:
        raw_html = response.read()
    soup = BeautifulSoup(raw_html, "html.parser")
    return tagger.get_snippet_urls(soup, meta)

def argument_parser():
    """Parse the command-line options of this script.

    Reads ``sys.argv`` and returns the ``argparse.Namespace`` with the
    attributes ``sbs_ticket`` (int), ``sysid`` (int), ``out_path`` (str),
    ``yql_token`` (str) and ``platform`` (str). Every option is optional;
    an option that is not given is left as ``None``.
    """
    # One (flag, dest, type, help) entry per supported option, in the order
    # they appear in ``--help``.
    option_specs = (
        ("-t", "sbs_ticket", int, "id of sbs ticket"),
        ("--sysid", "sysid", int, "id of system in given sbs experiment"),
        ("--out", "out_path", str, "path to output file"),
        ("--token", "yql_token", str, "yql oauth token"),
        ("--platform", "platform", str, "platform of htmls touch/desktop"),
    )

    cli = argparse.ArgumentParser(description='Get parameters')
    for flag, dest, value_type, help_text in option_specs:
        cli.add_argument(flag, dest=dest, type=value_type, help=help_text)
    return cli.parse_args()

if __name__ == "__main__":
    # Entry point: look up the pages of one system in one SBS ticket via YQL,
    # extract snippet URLs from every page in parallel, and write one
    # tab-separated line (query, region, html-url, tags) per page.
    args = argument_parser()
    client = YqlClient(db='hahn', token=args.yql_token)
    # NOTE: both placeholders come from argparse options declared with
    # type=int, so the string formatting below cannot inject arbitrary YQL.
    request = client.query(
        """SELECT
        CAST(Yson::LookupString(`query`, "text") as String) as query,
        CAST(Yson::LookupInt64(`query`, "region") as Int64) as region,
        CAST(Yson::LookupString(`page`, "html-url") as String) as `html-url`
        FROM `home/sbs/prod/sbs_plan_pages`
        WHERE `ticket-id` = {}
        and Yson::LookupString(`page`, "sys-id") == "{}"
        """.format(args.sbs_ticket, args.sysid),
        syntax_version=1
    )

    request.run()
    yandex_meta = SerpMetadata("yandex", args.platform)
    tagger = SerpTagger()
    # Create the worker pool and open the output file once for the whole run:
    # a pool per result table leaked worker processes, and reopening the file
    # in "w" mode per table discarded the rows of every earlier table.
    # The pool is entered first so workers are started before the file is open.
    with Pool(8) as pool, codecs.open(args.out_path, "w", "utf-8") as of:
        for table in request.get_results():
            # Return value is not needed; presumably this call is what
            # populates ``table.rows`` -- confirm against the YQL client.
            table.fetch_full_data()
            # row[2] is the `html-url` column of the SELECT above.
            tags = pool.map(
                get_urls,
                [(tagger, yandex_meta, row[2]) for row in table.rows]
            )
            for tbl_row, cur_tag in zip(table.rows, tags):
                # Re-encode the query text as Latin-1 and decode it as UTF-8;
                # presumably the client hands back UTF-8 bytes mis-decoded as
                # Latin-1 -- confirm before changing.
                of.write("{}\t{}\t{}\t{}\n".format(bytes(tbl_row[0], 'iso-8859-1').decode('utf8'), tbl_row[1], tbl_row[2], cur_tag))
