#!/usr/bin/env python
# -*- coding: utf-8 -*-

from sandbox.common.types.client import Tag
from sandbox.common.types.task import Status
from sandbox.sandboxsdk import parameters as sp
from sandbox.sandboxsdk import process
from sandbox.sandboxsdk.task import SandboxTask
from sandbox import sdk2

from sandbox.projects.common import environments
from sandbox.projects.common import apihelpers
from sandbox.projects.common import error_handlers as eh
from sandbox.projects.common import string
from sandbox.projects.common import utils
from sandbox.projects.common import ra2
from sandbox.projects.common.search.response import cgi

from sandbox.projects import GetRandomRequests as grr
from sandbox.projects.release_machine.tasks.ScrapeRequests2 import parameters as sr_params
from sandbox.projects.release_machine.tasks.ScrapeRequests2 import static as sr_static
from sandbox.projects import resource_types

from sandbox.projects.release_machine.tasks.GetReportSimultaneousResponses import response_threading as rt
from sandbox.projects.release_machine.tasks.GetReportSimultaneousResponses import yt_script

import os
import logging
import sys

# Maps the top-level domain of a beta host to a region id (kept as a string);
# the trailing comment on each entry names the city the id stands for.
# _get_queries_id looks the TLD up here and falls back to "213" when it is absent.
DOMAINS = {
    "ru": "213",    # Moscow
    "tr": "11508",  # Istanbul
    "ua": "143",    # Kiev
    "kz": "162",    # Almaty
    "by": "157",    # Minsk
}

# Task context (self.ctx) keys.
# REQUEST_COUNT and TOTAL_REQUEST_COUNT are written by on_execute once the scraper subtask exists.
REQUEST_COUNT = "request_count"
TOTAL_REQUEST_COUNT = "total_request_count"
# The next two keys are not referenced by the code visible in this file.
RESPONSES_ID = "responses_id"
SUBKEY_ERROR = "subkey_error"

# Not referenced by the code visible in this file.
MAX_SUBKEY_LENGTH = 4096

# Context keys under which the generated URL templates are stored:
# the *_TEMPLATE keys by prepare_execute, the *_TEMPLATE_DICT keys by _generate_search_url.
FIRST_BETA_TEMPLATE = "first_beta"
SECOND_BETA_TEMPLATE = "second_beta"
FIRST_BETA_TEMPLATE_DICT = "first_beta_template_dict"
SECOND_BETA_TEMPLATE_DICT = "second_beta_template_dict"

# Values assigned to the `group` attribute of the parameter classes below.
UNANSWERED_INFO_GROUP = "Parameters for search without unanswered sources"
YT_INFO_GROUP = "YT parameters"
QUERIES_INFO_GROUP = "Queries parameters"
RESPONSES_INFO_GROUP = "Responses parameters"
BATCHED_REQUESTS_GROUP = "Batched requests parameters"


class UseUnansweredCheckParameter(sp.SandboxBoolParameter):
    """
    Boolean input "use_unanswered" of the unanswered-sources group.

    It is only listed in UNANSWERED_PARAMETERS; the code visible in this file never reads it.
    """
    name = "use_unanswered"
    description = "Use checker for unanswered source"
    group = UNANSWERED_INFO_GROUP


class UnansweredSourcesParameter(sp.SandboxStringParameter):
    """
    Comma-separated list of sources to check.

    As parsed by _check_params, an item may carry options after the last ":" in the
    form "key=value|key=value".
    """
    name = "unanswered_sources"
    description = "Names for checking sources (CSV)"
    default_value = ""
    group = UNANSWERED_INFO_GROUP


class UnansweredTriesNumberParameter(sp.SandboxIntegerParameter):
    """
    Integer input "unanswered_tries_count" (default 5) of the unanswered-sources group.

    Not read by the code visible in this file.
    """
    name = "unanswered_tries_count"
    description = "Tries count"
    default_value = 5
    group = UNANSWERED_INFO_GROUP


class UnansweredDiffThresholdParameter(sp.SandboxFloatParameter):
    """
    Float input "unanswered_diff_threshold" (default 1e-3).

    _check_params reports this value as the default when a source's options
    do not contain the rt.MAX_DELTA key.
    """
    name = "unanswered_diff_threshold"
    description = "Maximum allowable absolute difference between unanswered data for one source"
    default_value = 1e-3
    group = UNANSWERED_INFO_GROUP


class FullyAnsweredSourcesParameter(sp.SandboxStringParameter):
    """
    String input "success_only": per its description, a CSV of sources whose
    subsources must all have answered.

    Not read by the code visible in this file.
    """
    name = "success_only"
    description = "Sources with all answered subsources (CSV)"
    default_value = ""
    group = UNANSWERED_INFO_GROUP


class TryTillFullSuccessParameter(sp.SandboxBoolParameter):
    """
    Boolean input "try_till_full_success" (default True) of the unanswered-sources group.

    Not read by the code visible in this file.
    """
    name = "try_till_full_success"
    description = "Should we use all tries hoping for no unanswers at all"
    default_value = True
    group = UNANSWERED_INFO_GROUP


class YtUsingParameter(sp.SandboxBoolParameter):
    """
    Boolean input (default True) described as choosing YT as the responses storage.

    Note that the context name is "use_mapreduce", not a YT-based name.
    Not read by the code visible in this file.
    """
    name = "use_mapreduce"
    description = (
        "Use yt as responses storage "
        "(uses internal task REPORT_RESPONSES if switched off, enabled by default)"
    )
    default_value = True
    group = YT_INFO_GROUP


class RequestsParameter(sr_params.RequestsResource):
    """
    sr_params.RequestsResource with `required` overridden to False.

    _get_queries_id handles the case where no requests resource is supplied.
    """
    required = False


class NumberOfRandomRequestsParameter(sp.SandboxIntegerParameter):
    """
    Integer input "random_requests" (default 3000).

    _get_queries_id passes it to the GET_RANDOM_REQUESTS subtask and uses it as a
    resource attribute filter; on_execute uses it as the fallback for TOTAL_REQUEST_COUNT.
    """
    name = "random_requests"
    description = "Number of random requests (if there is no resource with queries)"
    default_value = 3000
    group = QUERIES_INFO_GROUP


class UseLatestQueries(sp.SandboxBoolParameter):
    """
    Boolean input "use_latest_queries" (default False).

    When set, _get_queries_id first looks for an existing USERS_QUERIES resource
    with matching attributes before trying any other source of queries.
    """
    name = "use_latest_queries"
    description = "Use latest random queries, instead of generating new"
    default_value = False
    group = QUERIES_INFO_GROUP


class TriesCountParameter(sp.SandboxIntegerParameter):
    """
    Required integer input "tries_count" (default 5).

    Not read by the code visible in this file.
    """
    name = "tries_count"
    description = "Number of tries to get a response"
    required = True
    default_value = 5
    group = QUERIES_INFO_GROUP


class ScraperProfileName(sp.SandboxStringParameter):
    """
    String input "scraper_profile_name" (default empty).

    Not read by the code visible in this file; in particular it is not forwarded
    to the subtask created by _run_scrape_requests_2.
    """
    name = "scraper_profile_name"
    description = (
        "Name of profile which is used in Scraper configuration, "
        "leave it empty to not use profile (read Scraper wiki for more info)"
    )
    default_value = ""
    group = RESPONSES_INFO_GROUP


class FirstBetasDict(sp.DictRepeater, sp.SandboxStringParameter):
    """
    Optional dict input "first_betas_dict" for the first (sample) beta.

    _generate_search_url treats the keys as regions and the values as hosts;
    _run_scrape_requests_2 forwards the dict as "first_beta_hosts".
    """
    name = "first_betas_dict"
    description = "Betas for first (sample)"
    required = False
    group = BATCHED_REQUESTS_GROUP


class SecondBetasDict(sp.DictRepeater, sp.SandboxStringParameter):
    """
    Optional dict input "second_betas_dict" for the second (test) beta.

    _generate_search_url looks hosts up in it by the regions of the first betas dict;
    _run_scrape_requests_2 forwards the dict as "second_beta_hosts".
    """
    name = "second_betas_dict"
    description = "Betas for second (test)"
    required = False
    group = BATCHED_REQUESTS_GROUP


class QueriesDict(sp.DictRepeater, sp.SandboxStringParameter):
    """
        Use ids of resources in string

        Optional dict input "queries_dict". _run_scrape_requests_2 converts every
        value with int() and forwards the result, keyed as given, as "requests_resources".
    """
    name = "queries_dict"
    description = "Queries for betas"
    required = False
    group = BATCHED_REQUESTS_GROUP


# Parameter groups. Each tuple below is concatenated into
# GetReportSimultaneousResponses.input_parameters.

# Parameters that are not specific to either beta.
GENERAL_PARAMETERS = (
    ScraperProfileName,
)

# Per-beta settings of the first beta (host, collection, user agent, CGI, JSON dump flags).
FIRST_BETA_PARAMETERS = (
    sr_params.FirstBetaHostParameter,
    sr_params.FirstBetaCollectionParameter,
    sr_params.FirstBetaUserAgentParameter,
    sr_params.FirstBetaCgiParameter,
    sr_params.FirstBetaDumpAllJson,
    sr_params.FirstBetaJsonToStandardFormat,
)

# The same set of settings for the second beta.
SECOND_BETA_PARAMETERS = (
    sr_params.SecondBetaHostParameter,
    sr_params.SecondBetaCollectionParameter,
    sr_params.SecondBetaUserAgentParameter,
    sr_params.SecondBetaCgiParameter,
    sr_params.SecondBetaDumpAllJson,
    sr_params.SecondBetaJsonToStandardFormat,
)

# YT server, user and output table, plus the storage switch.
YT_PARAMETERS = (
    YtUsingParameter,
    sr_params.YtServerParameter,
    sr_params.YtUserParameter,
    sr_params.OutputTableParameter,
)

# Settings of the unanswered-sources check.
UNANSWERED_PARAMETERS = (
    UseUnansweredCheckParameter,
    UnansweredSourcesParameter,
    FullyAnsweredSourcesParameter,
    UnansweredTriesNumberParameter,
    UnansweredDiffThresholdParameter,
    TryTillFullSuccessParameter,
)

# Where the queries come from and how many of them to use.
QUERIES_INFO_PARAMETERS = (
    RequestsParameter,
    NumberOfRandomRequestsParameter,
    UseLatestQueries,
    TriesCountParameter,
)

# Dict-valued parameters of the batched mode.
BATCHED_MODE_PARAMETERS = (
    FirstBetasDict,
    SecondBetasDict,
    QueriesDict,
)

# Private task context keys.
# Id of the GET_RANDOM_REQUESTS subtask, stored by _get_queries_id.
_GET_QUERIES_SUBTASK = "get_queries_subtask"
# Read by _get_subtasks; never written by the code visible in this file.
_GET_UIDS_SUBTASK = "get_uids_subtask"
# Id of the SCRAPE_REQUESTS_2 subtask, stored by _download_using_scraper.
_SCRAPER_SUBTASK = "scraper_subtask"
# Not referenced by the code visible in this file.
_OUTPUT_NAME = "json_responses"


class GetReportSimultaneousResponses(SandboxTask):
    """
    Task for getting JSON responses from two upper betas
    Try to get responses for one query simultaneously

    As implemented by on_execute, the task obtains a queries resource (possibly
    through a GET_RANDOM_REQUESTS subtask), launches a SCRAPE_REQUESTS_2 subtask
    and, once that subtask id is in the context, post-processes the output YT
    table and stores request counters in the context.
    """
    type = "GET_REPORT_SIMULTANEOUS_RESPONSES"
    # Full list of task inputs, assembled from the module-level parameter groups.
    input_parameters = (
        FIRST_BETA_PARAMETERS +
        SECOND_BETA_PARAMETERS +
        GENERAL_PARAMETERS +
        QUERIES_INFO_PARAMETERS +
        ra2.PERSONAL_UIDS_PARAMETERS +
        YT_PARAMETERS +
        UNANSWERED_PARAMETERS +
        (
            sr_params.ParserBinaryParameter,
            grr.SearchPropFilterParameter,
            grr.UseLatestRandomQueriesBinary,
        ) +
        (
            sr_params.ScraperOverYtPoolParameter,
        ) +
        BATCHED_MODE_PARAMETERS
    )
    client_tags = Tag.LINUX_PRECISE | Tag.LINUX_TRUSTY  # Lucid is ignored due to SEARCH-2285
    # Environments requested for the task: MapReduce plus the YT client packages.
    environment = (
        environments.SandboxMapReduceEnvironment(),
        environments.PipEnvironment('yandex-yt', use_wheel=True),
        environments.PipEnvironment('yandex-yt-yson-bindings-skynet', use_wheel=True)
    )
    cores = 1
    required_ram = 12 * 1024

    # NOTE(review): class-level mutable list, shared by all instances; neither
    # attribute below is used by the methods visible in this file.
    threads = []
    output = None

    def _get_mr_params(self):
        """
        Gather the YT settings and prepare the environment for YT calls.

        Side effects: syncs the last released MAPREDUCE_YT_EXECUTABLE resource and
        exports the "yt_token" vault entry of SEARCH-RELEASERS as YT_TOKEN in os.environ.

        :return: tuple (server, user, table, mr_yt_binary), where mr_yt_binary is
            whatever sync_resource returned for the synced executable
        """
        yt_server = self.ctx[sr_params.YtServerParameter.name]
        output_table = self.ctx[sr_params.OutputTableParameter.name]
        yt_user = self.ctx[sr_params.YtUserParameter.name]

        released_binary = apihelpers.get_last_released_resource(resource_types.MAPREDUCE_YT_EXECUTABLE)
        synced_binary = self.sync_resource(released_binary.id)

        # Child processes pick the token up from the environment.
        os.environ["YT_TOKEN"] = self.get_vault_data("SEARCH-RELEASERS", "yt_token")
        return yt_server, yt_user, output_table, synced_binary

    def _leave_responses_from_both_betas(self, output_table):
        """
        Run yt_script over the output table, rewriting it in place.

        The script is started with the current interpreter and receives the same
        table as both input and output. Judging by the method name it keeps only
        rows answered by both betas -- TODO confirm against yt_script.

        :param output_table: YT table path to process
        """
        yt_server = self._get_mr_params()[0]
        script_prefix = "yt_script.py"
        interpreter_and_script = "{} {}".format(sys.executable, yt_script.__file__)
        shell_command = "{command} {input} {output} --server {server}".format(
            command=interpreter_and_script,
            input=output_table,
            output=output_table,
            server=yt_server,
        )

        logging.info("Start %s part of task with command %s", script_prefix, shell_command)
        process.run_process(
            [shell_command],
            work_dir="./",
            timeout=3600,
            shell=True,
            check=True,
            log_prefix=script_prefix,
            outputs_to_one_file=True,
            environment=os.environ.copy(),  # carries YT_TOKEN set by _get_mr_params
        )

    def _get_table_row_count(self, table_path):
        """
        Ask YT for the @row_count attribute of a table.

        The command output is redirected to a file named 'temp_output_file'
        (a relative path), which is then read back and parsed.

        :param table_path: YT table path
        :return: row count as int
        """
        yt_server, yt_user, _, yt_binary = self._get_mr_params()
        dump_path = 'temp_output_file'
        command_template = "{mr_yt_binary} -server {server} -user {user} -get {table}/@row_count > {output}"
        sr_static.run_cmd(
            command_template.format(
                mr_yt_binary=yt_binary,
                server=yt_server,
                user=yt_user,
                table=table_path,
                output=dump_path,
            ),
            "MR",
        )

        with open(dump_path, 'r') as dump:
            raw_count = dump.read()
        return int(raw_count)

    def _get_collection_param(self, n, param):
        """
        Look up a setting of the collection selected for beta number n.

        :param n: beta number used in the "beta_collection_<n>" context key
        :param param: key inside the sr_static.COLLECTIONS entry
        :return: the stored value
        """
        collections = sr_static.COLLECTIONS
        selected_collection = self.ctx["beta_collection_{}".format(n)]
        return collections[selected_collection][param]

    def _generate_search_url(self, n):
        """
        Build the search URL template for beta number n.

        The template has the form "https://<host>/<collection>?text={text}&lr={lr}"
        with the beta's own CGI parameters applied (plus json_dump=1 when
        "beta_dump_all_json_<n>" is set).

        When both betas dicts are supplied, the per-region templates of beta n are
        also stored in the context (FIRST_BETA_TEMPLATE_DICT for n == 1,
        SECOND_BETA_TEMPLATE_DICT for n == 2). Only the dict that belongs to beta n
        is rebuilt: previously every call rebuilt both dicts with the current beta's
        collection and CGI, so after the n == 2 call the first-beta templates
        carried second-beta settings.

        :param n: beta number (1 or 2) used in the "beta_*_<n>" context keys
        :return: URL template for the single host "beta_host_<n>"
        """
        beta = self.ctx["beta_host_{}".format(n)]
        collection = self._get_collection_param(n, "name")
        add_cgi = cgi.UrlCgiCustomizer(params=self.ctx["beta_cgi_params_{}".format(n)])
        if self.ctx["beta_dump_all_json_{}".format(n)]:
            add_cgi.add_custom_param("json_dump", "1")

        url_template = "https://{}/{}?text={{text}}&lr={{lr}}"
        first_beta_dict = self.ctx[FirstBetasDict.name]
        second_beta_dict = self.ctx[SecondBetasDict.name]

        # Both keys always exist after a call, whichever beta was requested.
        for template_key in (FIRST_BETA_TEMPLATE_DICT, SECOND_BETA_TEMPLATE_DICT):
            if template_key not in self.ctx:
                self.ctx[template_key] = {}

        own_settings = {
            "1": (FIRST_BETA_TEMPLATE_DICT, first_beta_dict),
            "2": (SECOND_BETA_TEMPLATE_DICT, second_beta_dict),
        }.get(str(n))

        if own_settings is not None:
            own_key, own_hosts = own_settings
            region_templates = {}
            if first_beta_dict and second_beta_dict:
                # The first betas dict defines the set of regions for both betas.
                for region in first_beta_dict:
                    host = own_hosts.get(region)
                    if not host:
                        # Avoid producing a "https://None/..." template.
                        logging.warning("No host for region %s in betas dict of beta #%s, skipping", region, n)
                        continue
                    region_templates[region] = add_cgi.apply_to_query(
                        url_template.format(host, collection)
                    )
            self.ctx[own_key] = region_templates

        result = url_template.format(beta, collection)

        return add_cgi.apply_to_query(result)

    def _should_personalize(self):
        """
        Return the context value of ra2.UsePersonalUidsParameter (None when absent).
        """
        personalization_key = ra2.UsePersonalUidsParameter.name
        return self.ctx.get(personalization_key)

    def prepare_execute(self):
        """
        Compute the URL templates of both betas and store them in the context.

        On top of each beta's own CGI the mandatory customization is applied:
        disable_mda, enable_light_cookie and two json_dump parameters.
        Results go to ctx[FIRST_BETA_TEMPLATE] and ctx[SECOND_BETA_TEMPLATE].
        """
        mandatory_cgi = cgi.UrlCgiCustomizer()
        mandatory_cgi = mandatory_cgi.disable_mda()
        mandatory_cgi = mandatory_cgi.enable_light_cookie()
        for dumped_path in ("search_props.UPPER.0.properties", "rdat.reqid"):
            mandatory_cgi = mandatory_cgi.add_custom_param("json_dump", dumped_path)

        for context_key, beta_number in ((FIRST_BETA_TEMPLATE, 1), (SECOND_BETA_TEMPLATE, 2)):
            beta_url = self._generate_search_url(beta_number)
            self.ctx[context_key] = mandatory_cgi.apply_to_query(beta_url)

        logging.info("First beta template: %s", self.ctx[FIRST_BETA_TEMPLATE])
        logging.info("Second beta template: %s", self.ctx[SECOND_BETA_TEMPLATE])

    def _get_headers_dict(self, n):
        specified_user_agent = self.ctx.get("beta_user_agent_{}".format(n), "")
        if specified_user_agent:
            return {"User-Agent": specified_user_agent}
        elif self.ctx["beta_collection_{}".format(n)] not in {"search", "video"}:
            return {"User-Agent": self._get_collection_param(n, "user_agent")}
        else:
            return {}

    def _check_params(self):
        """
        Inspect the unanswered-sources configuration and log missing thresholds.

        Every comma-separated item that contains ":" is split at its last ":";
        the tail is read as "key=value" options separated by "|". If the
        rt.MAX_DELTA key is not among them, the default threshold is logged.
        Nothing is returned or modified.
        """
        raw_items = self.ctx[UnansweredSourcesParameter.name].split(",")
        for entry in (item.strip() for item in raw_items):
            if ":" not in entry:
                continue

            _, raw_options = entry.rsplit(":", 1)
            # Only options with exactly one "=" are recognised.
            option_keys = {
                option.split("=")[0].strip()
                for option in raw_options.strip().split("|")
                if option.count("=") == 1
            }
            if rt.MAX_DELTA in option_keys:
                continue

            logging.info(
                "Cannot find %s parameter in config, use default value: %s",
                rt.MAX_DELTA,
                UnansweredDiffThresholdParameter.cast(self.ctx[UnansweredDiffThresholdParameter.name])
            )

    def _get_subtasks(self):
        """
        Return the ids of the helper subtasks recorded in the context.

        :return: tuple (queries subtask id, uids subtask id); an element is None
            when the corresponding key is absent
        """
        queries_subtask_id = self.ctx.get(_GET_QUERIES_SUBTASK)
        uids_subtask_id = self.ctx.get(_GET_UIDS_SUBTASK)
        return queries_subtask_id, uids_subtask_id

    def on_execute(self):
        """
        Task entry point.

        Steps:
          1. build the URL templates (prepare_execute);
          2. obtain the queries resource id, which may create a subtask;
          3. if a new subtask appeared during step 2, wait for subtasks
             (presumably the task is suspended and this method re-entered later
             -- TODO confirm against utils.wait_all_subtasks_stop);
          4. if the scraper subtask id is already in the context, post-process
             the output table and store REQUEST_COUNT / TOTAL_REQUEST_COUNT;
             otherwise launch the scraper subtask.
        """
        self.prepare_execute()

        # Snapshot taken before _get_queries_id() so a newly created subtask can be detected.
        subtasks = self._get_subtasks()

        queries_id = self._get_queries_id()

        utils.check_subtasks_fails()
        if subtasks != self._get_subtasks():
            utils.wait_all_subtasks_stop()

        if _SCRAPER_SUBTASK in self.ctx:
            output_table = self.ctx[sr_params.OutputTableParameter.name]
            self._leave_responses_from_both_betas(output_table)
            # Explicit floor division keeps the counter an integer regardless of the
            # interpreter's division semantics (the file already uses `//` elsewhere).
            # Halving presumably reflects one table row per beta for every request.
            self.ctx[REQUEST_COUNT] = self._get_table_row_count(output_table) // 2
            subtask_id = self.ctx[_SCRAPER_SUBTASK]
            req_count = sdk2.Task[subtask_id].Context.total_requests_count
            # Fall back to the configured number when the subtask reported nothing.
            self.ctx[TOTAL_REQUEST_COUNT] = req_count or self.ctx[NumberOfRandomRequestsParameter.name]
        else:
            self._download_using_scraper(queries_id)

    def _get_queries_id(self):
        """
        Find the id of the resource with queries, trying the sources in order:

          1. when UseLatestQueries is set, the last USERS_QUERIES resource whose
             attributes match the requested count and region;
          2. the requests resource given in the task parameters;
          3. the USERS_QUERIES resource of the already created GET_RANDOM_REQUESTS subtask;
          4. otherwise a GET_RANDOM_REQUESTS subtask is created and its id stored in
             the context.

        :return: resource id, or None when a subtask has just been created (case 4)
        """
        if self.ctx.get(UseLatestQueries.name):
            req_attr = "random_requests={},geo_id={}".format(
                NumberOfRandomRequestsParameter.cast(self.ctx[NumberOfRandomRequestsParameter.name]),
                self._get_region_id(),
            )
            logging.info("Get queries from task which collected random queries with attrs: %s", req_attr)
            reqs_resource = apihelpers.get_last_resource_with_attrs(
                resource_types.USERS_QUERIES,
                attrs=string.parse_attrs(req_attr),
                all_attrs=True,
            )
            if reqs_resource:
                logging.info(
                    "Found %s task which collected random queries. Syncing...", reqs_resource.task_id
                )
                return reqs_resource.id
        queries_id = self.ctx.get(sr_params.RequestsResource.name)
        if queries_id:
            logging.info("Collect queries from resource #%s", queries_id)
        elif _GET_QUERIES_SUBTASK in self.ctx:
            subtask_id = self.ctx[_GET_QUERIES_SUBTASK]
            logging.info("Get queries from child subtask #%s", subtask_id)
            queries_resources = apihelpers.list_task_resources(
                subtask_id, resource_type=resource_types.USERS_QUERIES
            )
            eh.verify(queries_resources, "No queries generated in task #{}".format(subtask_id))
            queries_id = queries_resources[0].id
        else:
            logging.info("Try to get queries from GetRandomRequests task")
            subtask = self.create_subtask(
                task_type="GET_RANDOM_REQUESTS",
                description="Get requests for GetReportSimultaneousResponses task #{}: '{}'".format(
                    self.id, self.descr
                ),
                input_parameters={
                    "notify_if_finished": "",
                    "num_of_requests": NumberOfRandomRequestsParameter.cast(
                        self.ctx[NumberOfRandomRequestsParameter.name]),
                    "mr_server": self.ctx[sr_params.YtServerParameter.name],
                    "region": self._get_region_id(),
                    "services": "web",
                    grr.SearchPropFilterParameter.name: self.ctx[grr.SearchPropFilterParameter.name],
                    grr.UseLatestRandomQueriesBinary.name: self.ctx[grr.UseLatestRandomQueriesBinary.name],
                },
            )
            self.ctx[_GET_QUERIES_SUBTASK] = subtask.id

        return queries_id

    def _get_region_id(self):
        """
        Derive the region id from the top-level domain of the first beta host.

        Falls back to 213 for an unknown TLD. Unlike indexing the rsplit result
        with [1], this also works for an empty host or a host without a dot,
        which used to raise IndexError.

        :return: region id as int
        """
        host = self.ctx.get(sr_params.FirstBetaHostParameter.name) or ""
        top_level_domain = host.rsplit(".", 1)[-1]
        return int(DOMAINS.get(top_level_domain, "213"))

    def _run_scrape_requests_2(self, queries_id):
        """
        Create and enqueue the SCRAPE_REQUESTS_2 (sdk2) subtask.

        By default the subtask gets the single first/second beta hosts under the
        "ru" key and the single requests resource. When any of the batched-mode
        dicts (FirstBetasDict, SecondBetasDict, QueriesDict) is supplied, the hosts
        are replaced by the dicts and per-domain request resources are passed.

        Previously the batched override was applied unconditionally, so the
        single-host values were always discarded and an unset QueriesDict failed
        on `.items()`.

        :param queries_id: id of the resource with queries
        :return: the enqueued subtask
        """
        logging.info("Launching sdk2 version of scrape requests")
        task_class = sdk2.Task["SCRAPE_REQUESTS_2"]
        personal_uids_resource_id = self.ctx.get(ra2.PersonalUidsResourceParameter.name)
        input_parameters = {
            "first_beta_hosts": {"ru": self.ctx[sr_params.FirstBetaHostParameter.name]},
            "first_beta_user_agent": self.ctx.get(sr_params.FirstBetaUserAgentParameter.name, ""),
            "first_beta_collection": self.ctx[sr_params.FirstBetaCollectionParameter.name],
            "first_beta_cgi": self.ctx[sr_params.FirstBetaCgiParameter.name],
            "first_beta_dump_all_json": self.ctx[sr_params.FirstBetaDumpAllJson.name],
            "first_beta_json_to_standard_format": self.ctx[sr_params.FirstBetaJsonToStandardFormat.name],
            "second_beta_hosts": {"ru": self.ctx[sr_params.SecondBetaHostParameter.name]},
            "second_beta_user_agent": self.ctx.get(sr_params.SecondBetaUserAgentParameter.name, ""),
            "second_beta_collection": self.ctx[sr_params.SecondBetaCollectionParameter.name],
            "second_beta_cgi": self.ctx[sr_params.SecondBetaCgiParameter.name],
            "second_beta_dump_all_json": self.ctx[sr_params.SecondBetaDumpAllJson.name],
            "second_beta_json_to_standard_format": self.ctx[sr_params.SecondBetaJsonToStandardFormat.name],
            "requests_resource": sdk2.Resource[queries_id],
            "output_table": self.ctx[sr_params.OutputTableParameter.name],
            "yt_server": self.ctx[sr_params.YtServerParameter.name],
            "yt_user": self.ctx[sr_params.YtUserParameter.name],
            "parser_binary": sdk2.Resource[self.ctx[sr_params.ParserBinaryParameter.name]],
            "filter_cgi": False,
            "scraper_over_yt_pool": self.ctx[sr_params.ScraperOverYtPoolParameter.name],
            "use_personal_uids": utils.get_or_default(self.ctx, ra2.UsePersonalUidsParameter),
            "kill_timeout": 3 * 3600,
            "active_waiting_duration": 15 * 60,
            "passive_waiting_tries": 20,
            "passive_waiting_duration": 3600 // 2,
        }
        if personal_uids_resource_id:
            input_parameters["personal_uids_resource"] = personal_uids_resource_id

        # Batched-mode inputs are optional (required = False), so treat a missing value as empty.
        first_betas = self.ctx.get(FirstBetasDict.name) or {}
        second_betas = self.ctx.get(SecondBetasDict.name) or {}
        queries_by_domain = self.ctx.get(QueriesDict.name) or {}
        if first_betas or second_betas or queries_by_domain:
            logging.info("Batched mode parameters are set, using dict hosts and dict resources")
            input_parameters.update({
                "first_beta_hosts": first_betas,
                "second_beta_hosts": second_betas,
                "use_dict_resources": True,
                "requests_resources": {domain: int(res_id) for domain, res_id in queries_by_domain.items()},
            })

        subtask = task_class(
            task_class.current,
            description="Get requests for GetReportSimultaneousResponses task #{}: \"{}\"".format(self.id, self.descr),
            owner=self.owner,
            priority=self.priority,
            **input_parameters
        )
        logging.info("Subtask created")
        subtask.enqueue()
        return subtask

    def _download_using_scraper(self, queries_id):
        """
        Launch the ScrapeRequests2 subtask, remember its id and wait for it.

        The id is stored under _SCRAPER_SUBTASK before waiting, which is what
        on_execute checks to decide that the responses can be post-processed.

        :param queries_id: id of the resource with queries
        """
        logging.info("Trying to get responses with ScrapeRequests2 task")
        scraper_task = self._run_scrape_requests_2(queries_id)
        self.ctx[_SCRAPER_SUBTASK] = scraper_task.id

        # Wait until the subtask reaches any finished or broken status.
        awaited_statuses = tuple(Status.Group.FINISH) + tuple(Status.Group.BREAK)
        self.wait_tasks([scraper_task], statuses=awaited_statuses, wait_all=True, state=None)


# Module-level alias of the task class; presumably this is how the task
# is discovered by the framework -- TODO confirm.
__Task__ = GetReportSimultaneousResponses
