# -*- coding: utf-8 -*-

import logging
import urllib
import re
import time
import socket

import sandbox.common.types.misc as ctm

from sandbox.sandboxsdk import parameters
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.channel import channel
from sandbox.projects import resource_types
from sandbox.projects.common import apihelpers
from sandbox.projects.common import decorators
from sandbox.projects.common import dolbilka
from sandbox.projects.common import error_handlers as eh
from sandbox.projects.common import file_utils as fu
from sandbox.projects.common import requests_wrapper
from sandbox.projects.common import string
from sandbox.projects.common import utils
from sandbox.projects.common.search import components as sc
from sandbox.projects.common.search.response import cgi


# Upper-cased device name from the request info -> value substituted into the
# "filter_platform=..." wizextra cgi parameter (see WizardQueriesProd._create_requests).
# Devices missing from this mapping get no platform filter.
PLATFORMS = {
    "ANDROID": "android",
    "IPHONE": "ios",
}


def create_wiz_queries_params(group_name="Wizard queries parameters"):
    """
        Build a container class holding the input parameter classes of the wizard queries task.

        :param group_name: value assigned to the ``group`` attribute of every parameter,
            except WizardBuild2 and WizardPackage which use the fixed group 'Extended'
        :return: the WizQueriesParams class; its ``params`` attribute lists all parameter classes
    """
    class WizQueriesParams(object):
        """
            Wizard queries parameters
        """
        # Base url of the upper search; "/yandsearch" is appended to it when requests are built
        class UpperHost(parameters.SandboxStringParameter):
            name = 'upper_host'
            description = 'Uppersearch host'
            default_value = 'https://hamster.yandex.ru'
            group = group_name

        # When a custom wizard is started, selects its apphost port instead of the plain port
        class ApphostEnabled(parameters.SandboxBoolParameter):
            name = 'apphost_enabled'
            description = 'Apphost enabled'
            default_value = True
            group = group_name

        # Input queries resource: either USERS_QUERIES or PLAIN_TEXT_QUERIES
        class TextReqs(parameters.ResourceSelector):
            name = 'text_requests'
            description = 'Text requests'
            required = True
            resource_type = [resource_types.USERS_QUERIES, resource_types.PLAIN_TEXT_QUERIES]
            group = group_name

        # Fetching stops once this many requests were collected; a falsy value disables the limit
        class LimReqs(parameters.SandboxIntegerParameter):
            name = 'limit_requests'
            description = 'Limit requests'
            group = group_name

        # Collect basesearch requests (adds pron=nometarank to the fired requests)
        class GetSubRequests(parameters.SandboxBoolParameter):
            name = 'get_sub_requests'
            description = 'Get subrequests (disable metarank)'
            group = group_name

        # Collect mmeta requests (taken from the "CurHttpUrl.debug" property)
        class GetMmetaRequests(parameters.SandboxBoolParameter):
            name = 'get_mmeta_requests'
            description = 'Get mmeta requests'
            group = group_name

        # Extra "name=value" pairs joined with "&", parsed by _parse_add_cgi
        class AddCGI(parameters.SandboxStringParameter):
            name = 'add_cgi'
            description = 'Additional cgi parameter'
            default_value = ''
            group = group_name

        # Also create BASESEARCH_PLAN resources from the saved queries
        class CreatePlans(parameters.SandboxBoolParameter):
            name = 'create_plans'
            description = 'Create plans'
            group = group_name

        # Attributes string parsed with string.parse_attrs and set on output resources
        class Attr(parameters.SandboxStringParameter):
            name = 'attribute'
            description = 'Additional attributes for resources'
            default_value = ''
            group = group_name

        class DisableExperiments(parameters.SandboxBoolParameter):
            name = 'no_tests'
            description = 'Disable TDI experiments (no-tests=1)'
            default_value = True
            group = group_name

        # Strip the "haha=da" substring from the collected requests
        class Haha(parameters.SandboxBoolParameter):
            name = 'remove_haha'
            description = 'Remove haha=da'
            default_value = True
            group = group_name

        # Optional build task providing WIZARD_SHARD, REMOTE_WIZARD and WIZARD_CONFIG
        # resources; when set, a local wizard is started and requests are pointed at it
        class WizardBuild2(parameters.TaskSelector):
            name = 'wizard_build_2_task_id'
            default_value = None
            description = 'Wizard build task'
            task_type = 'BUILD_WIZARD_2'
            group = 'Extended'

        # Runtime package passed to the local wizard started from WizardBuild2
        class WizardPackage(parameters.ResourceSelector):
            name = 'wizard_package_resource_id'
            default_value = None
            description = 'Wizard runtime package resource'
            resource_type = resource_types.WIZARD_RUNTIME_PACKAGE
            group = 'Extended'

        # A positive value is sent in srcrwr parameters for BEGEMOT_MERGER, BEGEMOT_GRAPH and WIZARD
        class BegemotTimeout(parameters.SandboxIntegerParameter):
            name = 'begemot_timeout'
            default_value = 0
            description = 'Increase begemot services timeouts for n (ms)'
            group = group_name

        # Stop sending requests after 9000 seconds (2.5 hours) of fetching
        class LimitExecution(parameters.SandboxBoolParameter):
            name = 'limit_execution'
            default_value = False
            description = 'Limit task execution by 2.5 hours'
            group = group_name

        # All parameter classes declared above, in declaration order
        params = [
            UpperHost,
            ApphostEnabled,
            TextReqs,
            LimReqs,
            GetSubRequests,
            GetMmetaRequests,
            AddCGI,
            CreatePlans,
            Attr,
            DisableExperiments,
            Haha,
            WizardBuild2,
            WizardPackage,
            BegemotTimeout,
            LimitExecution,
        ]

    return WizQueriesParams


# Parameter set with the default group name; used by WizardQueriesProd below
wiz_queries_params = create_wiz_queries_params()


# Match the debug cgi parameters (with an optional leading "&") that
# _remove_dump_subreqs_params_from_q strips from collected requests
_re_json_dump = re.compile(r'&?json_dump=search_props\.WEB')
_re_collect_subreqs = re.compile(r'&?pron=collect_subreqs')

# json_dump value requested from the upper search and the key under which
# the searcher properties are looked up in the json response
_SEARCH_PROPS_WEB = "search_props.WEB"
# json_dump value / response key used to report the request id of failed requests
_RDAT_REQID = "rdat.reqid"
# Number of malformed input lines tolerated by WizardQueriesProd._parse_user_queries
_MAX_BAD_QUERIES = 5000


def _remove_dump_subreqs_params_from_q(q):
    """
        Strip the debug cgi parameters from a collected request.

        :param q: request string
        :return: ``q`` without ``pron=collect_subreqs`` and ``json_dump=search_props.WEB``
            occurrences (each removed together with its leading "&", if any)
    """
    without_subreqs_pron = _re_collect_subreqs.sub('', q)
    cleaned = _re_json_dump.sub('', without_subreqs_pron)
    return cleaned


class WizardQueriesProd(SandboxTask):
    """
        **Description**

            Takes queries (USERS_QUERIES or PLAIN_TEXT_QUERIES) and fires them at the URL
            given in the input parameters.
            Collects the wizarded requests addressed to the middle (mmeta) and base searches.
            Optionally generates plans from the collected requests.

        **Required resources**

        * Text requests (of type USERS_QUERIES or PLAIN_TEXT_QUERIES)

        **Parameters**

        * **Uppersearch host** - report to shoot at (hamster.yandex.ru by default),
        * **Get subrequests** - collect requests to the base search (this option turns off
         the ranking mode on the middle search),
        * **Get mmeta requests** - collect requests to the middle search,
        * **Create plans** - additionally save the requests as binary plans
        * **Attribute** - attributes for the resources (queries and plan)
        * **Remove haha=da** - remove "haha=da" from the wizarded requests
         (affects only base search requests, because middle search requests
         do not contain this cgi parameter)

        **Created resources**

        * PLAIN_TEXT_QUERIES
        * BASESEARCH_PLAN (if the **Create plans** option is enabled)

    """
    type = "WIZARD_QUERIES_PROD"
    dns = ctm.DnsType.DNS64
    input_parameters = wiz_queries_params.params
    execution_space = 60 * 1024  # 60 Gb

    # search type -> attributes dict for its output resources; filled in on_execute()
    attrs_for_out_resources = None

    def on_enqueue(self):
        """
            Create the output resources and store their ids in the task context.

            For every enabled search type a PLAIN_TEXT_QUERIES resource is created
            (plus a BASESEARCH_PLAN one when plans are requested); a resource for
            the urls of bad requests is always created.
        """
        SandboxTask.on_enqueue(self)
        for search_type in self._search_types():
            self.ctx["wizarded_queries_for_{}".format(search_type)] = self.create_resource(
                "Wizarded queries for {}, {}".format(search_type, self.descr),
                "wizarded_queries_for_{}.txt".format(search_type),
                resource_types.PLAIN_TEXT_QUERIES,
            ).id
            if self.ctx.get(wiz_queries_params.CreatePlans.name):
                self.ctx["wizarded_plan_for_{}".format(search_type)] = self.create_resource(
                    "Wizarded plan for {}, {}".format(search_type, self.descr),
                    "wizarded_plan_for_{}.bin".format(search_type),
                    resource_types.BASESEARCH_PLAN,
                ).id
        self.ctx["bad_requests_resource"] = self.create_resource(
            "Wizard bad requests", "bad_requests.txt", resource_types.OTHER_RESOURCE
        ).id

    def _search_types(self):
        """
            :return: list of enabled search types ("basesearch" and/or "mmeta")
        """
        types = []
        if self.ctx.get(wiz_queries_params.GetSubRequests.name):
            types.append("basesearch")
        if self.ctx.get(wiz_queries_params.GetMmetaRequests.name):
            types.append("mmeta")
        return types

    def on_execute(self):
        """
            Fire the queries at the upper search, save the collected requests
            and, if requested, convert them to plans.

            When a wizard build task is given, a local wizard is started from its
            resources and the requests are pointed at it via ``wizhosts``.
        """
        common_attrs = string.parse_attrs(utils.get_or_default(self.ctx, wiz_queries_params.Attr))
        # Each search type needs its OWN dict: _save_responses() stores a per-type
        # "queries_amount" in it, and with a shared dict _gen_plans() would stamp
        # every plan with the amount written last.
        self.attrs_for_out_resources = {st: dict(common_attrs or {}) for st in self._search_types()}

        wizard_custom_build_task = utils.get_or_default(self.ctx, wiz_queries_params.WizardBuild2)
        req_info = self._prepare_requests_info()

        if wizard_custom_build_task:
            wizard_data = self.sync_resource(apihelpers.get_task_resource_id(
                wizard_custom_build_task, resource_type=resource_types.WIZARD_SHARD
            ))
            wizard_binary = self.sync_resource(apihelpers.get_task_resource_id(
                wizard_custom_build_task, resource_type=resource_types.REMOTE_WIZARD
            ))
            wizard_config = self.sync_resource(apihelpers.get_task_resource_id(
                wizard_custom_build_task, resource_type=resource_types.WIZARD_CONFIG
            ))
            runtime_data = self.sync_resource(utils.get_or_default(self.ctx, wiz_queries_params.WizardPackage))

            wizard = sc.get_wizard(wizard_binary, wizard_config, wizard_data, runtime_data)
            # Responses must be fetched while the local wizard is still running
            with wizard as wiz:
                port = wiz.port
                if utils.get_or_default(self.ctx, wiz_queries_params.ApphostEnabled):
                    port = wiz.apphost_port
                reqs = self._create_requests(req_info, wizard_host=socket.gethostname(), wizard_port=port)
                responses = self._get_fetched_responses(reqs)

        else:
            reqs = self._create_requests(req_info)
            responses = self._get_fetched_responses(reqs)

        paths = self._save_responses(responses)
        if self.ctx.get(wiz_queries_params.CreatePlans.name):
            self._gen_plans(paths)

    def _parse_user_queries(self, queries_res):
        """
            Extract request info from a plain text queries resource.

            Every input line is searched for the ``user_request=``, ``tld=`` and
            ``relevgeo%3D`` (inside ``relev=``) cgi values. Lines lacking any of them
            are logged and skipped; ``eh.check_failed`` is called once more than
            ``_MAX_BAD_QUERIES`` lines were skipped. Accepted queries are also written
            as tab-separated "text, region" lines to a new USERS_QUERIES resource.

            :param queries_res: queries resource object (its ``id`` is used to sync the file)
            :return: generator of (user request, region, tld, None) tuples
        """
        user_request_re = re.compile(r"user_request=([^&]+)")
        tld_re = re.compile(r"tld=([^&]+)")
        region_re = re.compile(r"relev=.*relevgeo%3D([^&;]+)")
        users_queries_res = self.create_resource(
            "Users queries from plain text, {}".format(self.descr),
            "users_queries_from_plain_text.txt",
            resource_types.USERS_QUERIES,
        )
        bad_q_count = 0
        with open(self.sync_resource(queries_res.id)) as f, open(users_queries_res.path, "w") as out_f:
            for line in f:
                if bad_q_count > _MAX_BAD_QUERIES:
                    eh.check_failed("Too many bad queries in input file")
                maybe_user_req = user_request_re.findall(line)
                maybe_tld = tld_re.findall(line)
                maybe_region = region_re.findall(line)
                if not (maybe_user_req and maybe_region and maybe_tld):
                    logging.error(
                        "Got incorrect info from plain test:\n"
                        "{}\nuser_req={}, region={}, tld={}\n"
                        "Skip this query!".format(line, maybe_user_req, maybe_region, maybe_tld)
                    )
                    bad_q_count += 1
                    continue

                user_req = urllib.unquote_plus(maybe_user_req[0].strip(" \n"))
                region = maybe_region[0].strip(" \n")
                # user_req is already decoded; decoding it a second time would corrupt
                # queries with literal "+" or "%XX" sequences (e.g. "c++" -> "c  ").
                out_f.write("{}\t{}\n".format(user_req, region))
                yield (user_req, region, maybe_tld[0], None)

    def _prepare_requests_info(self):
        """
            Load request info from the input queries resource according to its type.

            :return: iterable of (text, region, tld, device) tuples
        """
        queries_res = channel.sandbox.get_resource(self.ctx[wiz_queries_params.TextReqs.name])
        if queries_res.type == resource_types.USERS_QUERIES:
            return utils.get_user_queries(queries_res)
        elif queries_res.type == resource_types.PLAIN_TEXT_QUERIES:
            return self._parse_user_queries(queries_res)
        eh.check_failed("Unknown queries resource type: {}".format(queries_res.type))

    def _create_requests(self, req_info, wizard_host=None, wizard_port=None):
        """
            Build the urls to fire at the upper search.

            :param req_info: generator, so requires only one cycle
            :param wizard_host: host of a custom wizard (used only together with wizard_port)
            :param wizard_port: port of a custom wizard; None means no ``wizhosts`` parameter
            :return: list of cgi.UrlCgiCustomizer objects, one per query
        """
        logging.info("Creating requests...")
        common_url = cgi.UrlCgiCustomizer(
            base_url=utils.get_or_default(self.ctx, wiz_queries_params.UpperHost) + "/yandsearch"
        )
        common_url.add_custom_params([
            ("json_dump", _SEARCH_PROPS_WEB),
            ("json_dump", _RDAT_REQID),
            ("wizextra", "misspell_timeout=1500ms"),
            ("flag", "noapache_json_req=app_host:upper"),
            ("dbgwzr", "2"),
            ("rearr", "need_debug_http_url"),
        ])
        begemot_services_timeouts = utils.get_or_default(self.ctx, wiz_queries_params.BegemotTimeout)
        if begemot_services_timeouts > 0:
            common_url.add_custom_params([
                ("srcrwr", "BEGEMOT_MERGER:::{}".format(begemot_services_timeouts)),
                ("srcrwr", "BEGEMOT_GRAPH:::{}".format(begemot_services_timeouts)),
                ("srcrwr", "WIZARD:::{}".format(begemot_services_timeouts)),
            ])
        common_url.add_custom_pron("collect_subreqs").disable_mda().disable_cache().set_max_timeout()
        if utils.get_or_default(self.ctx, wiz_queries_params.DisableExperiments):
            common_url.disable_experiments()
        if utils.get_or_default(self.ctx, wiz_queries_params.GetSubRequests):
            common_url.add_custom_pron("nometarank")
        additional_cgi = utils.get_or_default(self.ctx, wiz_queries_params.AddCGI)
        if additional_cgi:
            for add_cgi in _parse_add_cgi(additional_cgi):
                common_url.add_custom_param(*add_cgi)
        if wizard_port is not None:
            common_url.add_custom_param("wizhosts", "{}:{}".format(wizard_host, wizard_port))

        reqs = []
        for text, reg, tld, device in req_info:
            device_u = device.upper() if device else None
            iter_url = cgi.UrlCgiCustomizer(common_url.base_url, common_url.params)
            iter_url.add_text(text).add_region(reg)
            if tld:
                iter_url.replace_tld(tld)
            if device and device_u in PLATFORMS:
                iter_url.add_custom_param("wizextra", "filter_platform={}".format(PLATFORMS[device_u]))
            reqs.append(iter_url)
        return reqs

    @decorators.retries(3)
    def _get_json_response(self, url):
        """
            Request ``url`` and return the decoded json body.

            :param url: object exposing ``base_url`` and ``params``
        """
        # NOTE(review): certificate verification is switched off here (verify=False)
        return requests_wrapper.get(url.base_url, params=url.params, verify=False).json()

    def _get_response(self, url, num_tries):
        """
            Ask for ``url`` until the first searcher properties entry is non-empty.

            Up to ``num_tries`` attempts are checked; if none of them has non-empty
            properties, one more request is made and its response is returned as is.

            :return: dict with "resp" (json response) and "retry_helps"
                (True when a non-first attempt brought non-empty properties)
        """
        for i in range(num_tries):
            resp = self._get_json_response(url)
            if _SEARCH_PROPS_WEB in resp and resp[_SEARCH_PROPS_WEB] and resp[_SEARCH_PROPS_WEB][0]["properties"]:
                return {"retry_helps": i != 0, "resp": resp}
        return {"retry_helps": False, "resp": self._get_json_response(url)}

    def _get_fetched_responses(self, reqs):
        """
            Fire the requests and collect basesearch / mmeta requests from the responses.

            Urls whose response lacks the searcher properties are written to the bad
            requests resource. Fetching stops early when the requests limit is reached
            or (if enabled) after 9000 seconds. The achieved rps is stored in
            ``self.ctx['wizardings_rps']``; quality checks are done via ``eh.ensure``.

            :param reqs: list of urls built by _create_requests
            :return: dict {"basesearch": [...], "mmeta": [...]} of collected request strings
        """
        logging.info("Dumping responses...")
        responses = {"basesearch": [], "mmeta": []}
        self.get_base = self.ctx.get(wiz_queries_params.GetSubRequests.name)
        self.get_mmeta = self.ctx.get(wiz_queries_params.GetMmetaRequests.name)
        # Read outside of the per-request try block: a missing context key must not
        # be swallowed and counted as a request failure on every iteration.
        requests_limit = self.ctx.get(wiz_queries_params.LimReqs.name)
        limit_execution = utils.get_or_default(self.ctx, wiz_queries_params.LimitExecution)
        start_time = time.time()
        reask_fail_count = 0
        fail_count = 0
        empty_props_count = 0.0
        bad_req = channel.sandbox.get_resource(self.ctx["bad_requests_resource"])
        requested_queries = 0
        retries_count = 0
        with open(bad_req.path, "wb") as br_fd:
            for url in reqs:
                resp = None
                requested_queries += 1
                try:
                    response = self._get_response(url, 3)
                    if response["retry_helps"]:
                        retries_count += 1
                    resp = response["resp"]

                    if _SEARCH_PROPS_WEB not in resp:
                        logging.error("No requested searcher props in response:\n%s\n----------", resp)
                        br_fd.write(str(url) + '\n')
                    else:
                        for props in resp[_SEARCH_PROPS_WEB]:
                            if not props["properties"]:
                                empty_props_count += 1
                                logging.warning("Empty property response detected!\nUrl: %s", url)
                                break
                            if self.get_base:
                                responses["basesearch"].append(
                                    _remove_dump_subreqs_params_from_q(
                                        props["properties"]["SubRequests.debug"]
                                    )
                                )
                            if self.get_mmeta:
                                if "CurHttpUrl.debug" in props["properties"]:
                                    mmeta_url = props["properties"]["CurHttpUrl.debug"]
                                    if mmeta_url:
                                        responses["mmeta"].append(_remove_dump_subreqs_params_from_q(mmeta_url))
                                    else:
                                        logging.warning("Empty CurHttpUrl.debug in props: %s\nUrl: %s", props, url)
                                else:
                                    logging.warning("Can't find CurHttpUrl.debug in %s\nUrl: %s", resp, url)

                    collected = max(len(responses["basesearch"]), len(responses["mmeta"]))
                    if requests_limit and collected >= requests_limit:
                        break
                    # not more than 2.5 hours
                    if limit_execution and time.time() - start_time > 9000:
                        break
                except Exception:
                    reqid = 'UNKNOWN'
                    if resp:
                        reqid = resp.get(_RDAT_REQID, reqid)

                    if '-REASK' in reqid:
                        # FIXME(mvel) reanimate this
                        reask_fail_count += 1
                        # it is misspell, typically
                        logging.warning("REASK detected. ReqId: %s, Url: %s\n", reqid, url)
                    else:
                        fail_count += 1
                        logging.error(
                            "\n\n==================\n"
                            "Error during request processing:\n%s\n"
                            "Url: %s\n"
                            "ReqId: %s\n"
                            "\n==================\n\n",
                            eh.shifted_traceback(), url, reqid,
                        )

        logging.debug("retries for empty props helps %s times, %s empty props left", retries_count, empty_props_count)

        # Clamp to avoid a division by (almost) zero on very fast runs
        time_delta = max(time.time() - start_time, 0.1)
        # Only the requests actually sent count: the loop may stop before the end of reqs
        rps = requested_queries / time_delta
        self.ctx['wizardings_rps'] = rps

        # Guards the ratio checks below against a division by zero
        eh.ensure(requested_queries, "No requests were sent")

        base_count = len(responses["basesearch"])
        if self.get_base:
            eh.ensure(base_count, "No basesearch requests were collected")

        mmeta_count = len(responses["mmeta"])
        if self.get_mmeta:
            eh.ensure(mmeta_count, "No mmeta requests were collected")
            eh.ensure(
                float(mmeta_count) / requested_queries > 0.5,
                "Too much answers without CurHttpUrl.debug"
            )

        eh.ensure(empty_props_count / requested_queries < 0.5, "Too much answers with empty properties!")
        self.set_info(
            "Fetched {} basesearch and {} mmeta requests with {} rps, "
            "got {} errors and {} reask errors".format(
                base_count,
                mmeta_count,
                rps,
                fail_count,
                reask_fail_count,
            )
        )
        # mmeta_count is zero when only basesearch requests are collected, so fall
        # back to base_count (and never divide by zero).
        collected_count = mmeta_count or base_count
        eh.ensure(float(fail_count) / max(collected_count, 1) < 0.05, "Too many non-reask errors")

        return responses

    def _save_responses(self, responses):
        """
            Write the collected requests to the pre-created queries resources.

            Search types with no collected requests are skipped. The number of
            written queries is stored in the "queries_amount" resource attribute.

            :param responses: dict search type -> list of request strings
            :return: dict search type -> path of the written queries file
        """
        logging.info("Saving responses ...")
        paths = {}

        for search_type, search_responses in responses.iteritems():
            if not search_responses:
                continue

            patched_responses = self._patch_responses(search_responses)

            res = channel.sandbox.get_resource(self.ctx["wizarded_queries_for_{}".format(search_type)])
            queries_written = fu.write_lines(res.path, patched_responses)
            logging.info("%s %s queries were saved", queries_written, search_type)
            self.attrs_for_out_resources[search_type]["queries_amount"] = str(queries_written)
            utils.set_resource_attributes(res, self.attrs_for_out_resources[search_type])
            paths[search_type] = res.path

        return paths

    def _patch_responses(self, responses):
        """
            Lazily normalize collected requests before saving.

            :param responses: iterable of request strings
            :return: generator of requests cut to start at the first "?" (unchanged when
                there is none), with "haha=da" removed if the corresponding option is on
        """
        logging.info("Patching responses ...")
        # cut everything before cgi request begin (?param1=value1&param2=...)
        responses = (resp[max(resp.find("?"), 0):] for resp in responses)

        # remove haha=da if necessary
        if self.ctx.get(wiz_queries_params.Haha.name):
            responses = (resp.replace("haha=da", "") for resp in responses)

        return responses

    def _gen_plans(self, paths):
        """
            Convert every saved queries file to a plan and set the plan resource attributes.

            :param paths: dict search type -> queries file path (as returned by _save_responses)
        """
        logging.info("Generating plans ...")
        for search_type, queries_path in paths.items():
            plan_res = channel.sandbox.get_resource(self.ctx["wizarded_plan_for_{}".format(search_type)])
            dolbilka.convert_queries_to_plan(queries_path, plan_res.path)
            utils.set_resource_attributes(plan_res, self.attrs_for_out_resources[search_type])


def _parse_add_cgi(add_cgi):
    for single_cgi in add_cgi.strip("&").split("&"):
        if single_cgi and "=" in single_cgi:
            yield tuple(single_cgi.split("=", 1))


# Module-level alias of the task class
# NOTE(review): presumably the name the Sandbox task loader looks up in this module — confirm
__Task__ = WizardQueriesProd
