# -*- coding: utf-8 -*-

import datetime
import logging
import os
import random
import re
import time
import sandbox.common.errors as common_errors
from sandbox.common.types.client import Tag

import sandbox.projects.release_machine.security as rm_sec
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.channel import channel
from sandbox.sandboxsdk import process
from sandbox.sandboxsdk import parameters
from sandbox.sandboxsdk import paths
from sandbox.projects.common import cms
from sandbox.projects.common import decorators
from sandbox.projects.common import error_handlers as eh
from sandbox.projects.common import file_utils as fu
from sandbox.projects.common import string
from sandbox.projects.common import utils
from sandbox.projects.common.search import instance_resolver as ir
from sandbox.projects.common.search.eventlog import eventlog
from sandbox.projects.common.search.settings import WebSettings
from sandbox.projects.common.search import cluster_names
from sandbox.projects.common.search import queries as search_queries
from sandbox.projects.common.dolbilka import DolbilkaPlanner
from sandbox.projects.common.search.config import SearchConfig
from sandbox.projects import resource_types
from sandbox.projects.images.metasearch import resources as images_metasearch_resources
from sandbox.projects.websearch.middlesearch import resources as ms_res


# Parameter group titles; the parameter classes below refer to them through
# their ``group`` attribute, one title per search component.
_WEB_MMETA_OPTIONS = "Web mmeta options"
_WEB_INT_OPTIONS = "Web int options"

_IMG_MMETA_OPTIONS = "Images mmeta options"
_IMG_INT_OPTIONS = "Images int options"

_VIDEO_MMETA_OPTIONS = "Video mmeta options"
_VIDEO_INT_OPTIONS = "Video int options"

_REFRESH_OPTIONS = "Refresh options"


class GetMmeta(object):
    """
        Boolean switches selecting which web mmeta resources are fetched.

        Each ``name`` follows the ``get_<host_type>_<resource_type>`` pattern that
        GetMiddlesearchResources looks up in the task context.
    """

    # Forced on by GetMiddlesearchResources.on_enqueue whenever Data is requested.
    class Config(parameters.SandboxBoolParameter):
        name = 'get_mmeta_cfg'
        description = 'Get mmeta config'
        group = _WEB_MMETA_OPTIONS

    class Requests(parameters.SandboxBoolParameter):
        name = 'get_mmeta_reqs'
        description = 'Get mmeta requests'
        group = _WEB_MMETA_OPTIONS
        # presumably reveals the 'mark_queries' option when this flag is on -- TODO confirm
        sub_fields = {'true': ['mark_queries']}

    class Data(parameters.SandboxBoolParameter):
        name = 'get_mmeta_data'
        description = 'Get mmeta data'
        group = _WEB_MMETA_OPTIONS

    # Overwritten with the Requests value in GetMiddlesearchResources.on_enqueue.
    class Plan(parameters.SandboxBoolParameter):
        name = 'get_mmeta_plan'
        description = 'Get mmeta plan'
        group = _WEB_MMETA_OPTIONS

    class ArchiveModels(parameters.SandboxBoolParameter):
        name = 'get_mmeta_models'
        description = 'Get mmeta models'
        group = _WEB_MMETA_OPTIONS

    # All switches of this component, concatenated into the task's input_parameters.
    params = (
        Config,
        Requests,
        Plan,
        ArchiveModels,
        Data,
    )


class GetInt(object):
    """
        Boolean switches selecting which web int resources are fetched.
    """

    class Config(parameters.SandboxBoolParameter):
        name = 'get_int_cfg'
        description = 'Get int config'
        group = _WEB_INT_OPTIONS

    class Requests(parameters.SandboxBoolParameter):
        name = 'get_int_reqs'
        description = 'Get int requests'
        group = _WEB_INT_OPTIONS

    # Overwritten with the Requests value in GetMiddlesearchResources.on_enqueue.
    class Plan(parameters.SandboxBoolParameter):
        name = 'get_int_plan'
        description = 'Get int plan'
        group = _WEB_INT_OPTIONS

    # All switches of this component, concatenated into the task's input_parameters.
    params = (Config, Requests, Plan)


class GetImgMmeta(object):
    """
        Boolean switches selecting which images mmeta resources are fetched.

        Data and Index use the ``img_data`` / ``img_index`` resource type suffixes,
        hence the doubled ``img`` in their names.
    """

    # Forced on by GetMiddlesearchResources.on_enqueue whenever Data is requested.
    class Config(parameters.SandboxBoolParameter):
        name = 'get_img_mmeta_cfg'
        description = 'Get img mmeta config'
        group = _IMG_MMETA_OPTIONS

    class Requests(parameters.SandboxBoolParameter):
        name = 'get_img_mmeta_reqs'
        description = 'Get img mmeta requests'
        group = _IMG_MMETA_OPTIONS
        # presumably reveals the 'mark_queries' option when this flag is on -- TODO confirm
        sub_fields = {'true': ['mark_queries']}

    class Data(parameters.SandboxBoolParameter):
        name = 'get_img_mmeta_img_data'
        description = 'Get img mmeta data'
        group = _IMG_MMETA_OPTIONS

    # Overwritten with the Requests value in GetMiddlesearchResources.on_enqueue.
    class Plan(parameters.SandboxBoolParameter):
        name = 'get_img_mmeta_plan'
        description = 'Get img mmeta plan'
        group = _IMG_MMETA_OPTIONS

    class Index(parameters.SandboxBoolParameter):
        name = 'get_img_mmeta_img_index'
        description = 'Get img mmeta index'
        group = _IMG_MMETA_OPTIONS

    # All switches of this component, concatenated into the task's input_parameters.
    params = (
        Config,
        Requests,
        Plan,
        Data,
        Index,
    )


class GetImgInt(object):
    """
        Boolean switches selecting which images int resources are fetched.
    """

    class Config(parameters.SandboxBoolParameter):
        name = 'get_img_int_cfg'
        description = 'Get img int config'
        group = _IMG_INT_OPTIONS

    class Requests(parameters.SandboxBoolParameter):
        name = 'get_img_int_reqs'
        description = 'Get img int requests'
        group = _IMG_INT_OPTIONS

    # NOTE(review): unlike the web int / mmeta / refresh plans, this flag is not
    # synchronised with Requests in GetMiddlesearchResources.on_enqueue -- confirm
    # whether that is intentional.
    class Plan(parameters.SandboxBoolParameter):
        name = 'get_img_int_plan'
        description = 'Get img int plan'
        group = _IMG_INT_OPTIONS

    # All switches of this component, concatenated into the task's input_parameters.
    params = (Config, Requests, Plan)


class GetVideoMmeta(object):
    """
        Boolean switches for video mmeta; only the config can be requested.
    """

    class Config(parameters.SandboxBoolParameter):
        name = 'get_video_mmeta_cfg'
        description = 'Get video mmeta config'
        group = _VIDEO_MMETA_OPTIONS

    # All switches of this component, concatenated into the task's input_parameters.
    params = (
        Config,
    )


class GetVideoInt(object):
    """
        Boolean switches for video int; only the config can be requested.
    """

    class Config(parameters.SandboxBoolParameter):
        name = 'get_video_int_cfg'
        description = 'Get video int config'
        group = _VIDEO_INT_OPTIONS

    # All switches of this component, concatenated into the task's input_parameters.
    params = (
        Config,
    )


class GetRefresh(object):
    """
        Boolean switches selecting which refresh resources are fetched.
    """

    class Config(parameters.SandboxBoolParameter):
        name = 'get_refresh_cfg'
        description = 'Get refresh config'
        group = _REFRESH_OPTIONS

    class Requests(parameters.SandboxBoolParameter):
        name = 'get_refresh_reqs'
        description = 'Get refresh requests'
        group = _REFRESH_OPTIONS

    # Overwritten with the Requests value in GetMiddlesearchResources.on_enqueue.
    class Plan(parameters.SandboxBoolParameter):
        name = 'get_refresh_plan'
        description = 'Get refresh plan'
        group = _REFRESH_OPTIONS

    class ArchiveModels(parameters.SandboxBoolParameter):
        name = 'get_refresh_models'
        description = 'Get refresh models'
        group = _REFRESH_OPTIONS

    # All switches of this component, concatenated into the task's input_parameters.
    params = (
        Config,
        Requests,
        Plan,
        ArchiveModels,
    )


# Optional explicit "host:port" of the web mmeta instance; per the description an
# empty value means a random instance is used.
class MmetaInstance(parameters.SandboxStringParameter):
    name = 'mmeta_instance'
    description = 'Mmeta host:port (random by default, without ".yandex.ru")'
    group = 'Instances'


# Optional explicit "host:port" of the web int instance; per the description an
# empty value means a random instance is used.
class IntInstance(parameters.SandboxStringParameter):
    name = 'int_instance'
    description = 'Int host:port (random by default, without ".yandex.ru")'
    group = 'Instances'


# Optional explicit "host:port" of the refresh instance; per the description an
# empty value means a random instance is used.
class RefreshInstance(parameters.SandboxStringParameter):
    name = 'refresh_instance'
    description = 'Refresh instance host:port (random by default, without ".yandex.ru")'
    group = 'Instances'


# Optional explicit "host:port" of the images mmeta instance; per the description an
# empty value means a random instance is used.
class ImgMmetaInstance(parameters.SandboxStringParameter):
    name = 'img_mmeta_instance'
    description = 'IMG Mmeta host:port (random by default, without ".yandex.ru")'
    group = 'Instances'


# Optional explicit "host:port" of the images int instance; per the description an
# empty value means a random instance is used.
class ImgIntInstance(parameters.SandboxStringParameter):
    name = 'img_int_instance'
    description = 'IMG Int host:port (random by default, without ".yandex.ru")'
    group = 'Instances'


# Optional explicit "host:port" of the video mmeta instance; per the description an
# empty value means a random instance is used.
# NOTE(review): not listed in GetMiddlesearchResources.input_parameters -- confirm
# whether it is meant to be settable from the task form.
class VideoMmetaInstance(parameters.SandboxStringParameter):
    name = 'video_mmeta_instance'
    description = 'Video Mmeta host:port (random by default, without ".yandex.ru")'
    group = 'Instances'


# Optional explicit "host:port" of the video int instance; per the description an
# empty value means a random instance is used.
# NOTE(review): not listed in GetMiddlesearchResources.input_parameters -- confirm
# whether it is meant to be settable from the task form.
class VideoIntInstance(parameters.SandboxStringParameter):
    name = 'video_int_instance'
    description = 'Video Int host:port (random by default, without ".yandex.ru")'
    group = 'Instances'


class QueriesOpts(object):
    """
        Options controlling how dumped queries are filtered and post-processed.

        All of them are read in _InstanceBase._load_requests_impl.
    """

    class MarkQueries(parameters.SandboxBoolParameter):
        name = 'mark_queries'
        description = 'Mark not cached queries as "&pron=nocachehit"'
        group = 'Queries options'

    class ByFormula(parameters.SandboxStringParameter):
        """
            Does not work with mark_queries enabled: when marking is on, the events
            needed for the formula filter are not requested from the eventlog.
        """
        name = 'filter_queries_by_formula'
        description = 'Queries only for formula:'
        default_value = ''
        group = 'Queries options'

    class UniqueQueries(parameters.SandboxBoolParameter):
        name = 'get_only_unique_queries'
        description = 'Sort queries and del repetitions'
        group = 'Queries options'

    class MakeUniqueReqids(parameters.SandboxBoolParameter):
        name = 'make_unique_reqids'
        description = 'Make reqids unique by adding "001" * n'
        default_value = True
        group = 'Queries options'

    class AddRandomPron(parameters.SandboxBoolParameter):
        name = 'add_random_pron'
        description = 'Add random pron to defeat cache issues (see SEARCH-2284)'
        default_value = True
        group = 'Queries options'

    class TurQueries(parameters.SandboxBoolParameter):
        name = 'get_only_tur_queries'
        description = 'Get only tur queries'
        group = 'Queries options'

    # AddRandomPron used to be declared but left out of this tuple, so it could
    # not be changed from the task form; its default (True) keeps old behaviour.
    params = (
        MarkQueries,
        ByFormula,
        UniqueQueries,
        MakeUniqueReqids,
        AddRandomPron,
        TurQueries,
    )


class Options(object):
    """
        Per-component settings: configuration (cluster) name, instance tag, number of
        requests to dump and plan rps, plus task-wide resource attribute options.
    """

    class WebMmetaConf(parameters.SandboxStringParameter):
        name = 'mmeta_conf'
        description = 'Web mmeta conf'
        group = _WEB_MMETA_OPTIONS
        # as stated 13.06.2017 by mcden@ (hamster by default)
        default_value = cluster_names.C.vla_jupiter_mmeta_hamster

    class WebMmetaTag(parameters.SandboxStringParameter):
        name = 'mmeta_tag'
        description = 'Web mmeta tag'
        group = _WEB_MMETA_OPTIONS
        default_value = ''

    class WebMmetaReqsCount(parameters.SandboxIntegerParameter):
        name = 'mmeta_reqs_count'
        description = 'Web mmeta request count'
        group = _WEB_MMETA_OPTIONS
        default_value = 5000

    class WebMmetaPlanRps(parameters.SandboxIntegerParameter):
        name = 'mmeta_plan_rps'
        description = 'Web mmeta plan rps'
        group = _WEB_MMETA_OPTIONS
        default_value = 20

    class WebIntConf(parameters.SandboxStringParameter):
        name = 'int_conf'
        description = 'Int conf'
        group = _WEB_INT_OPTIONS
        default_value = cluster_names.C.sas_web_int_hamster

    class WebIntTag(parameters.SandboxStringParameter):
        name = 'int_tag'
        description = 'Int tag'
        group = _WEB_INT_OPTIONS
        default_value = ''

    class WebIntReqsCount(parameters.SandboxIntegerParameter):
        name = 'int_reqs_count'
        description = 'Int request count'
        group = _WEB_INT_OPTIONS
        default_value = 10000

    class WebIntPlanRps(parameters.SandboxIntegerParameter):
        name = 'int_plan_rps'
        description = 'Int plan rps'
        group = _WEB_INT_OPTIONS
        default_value = 100

    class RefreshMmetaTag(parameters.SandboxStringParameter):
        name = 'refresh_tag'
        description = 'Refresh mmeta tag'
        group = _REFRESH_OPTIONS
        default_value = ''

    class RefreshMmetaConf(parameters.SandboxStringParameter):
        name = 'refresh_conf'
        description = 'Refresh configuration prefix'
        group = _REFRESH_OPTIONS
        default_value = cluster_names.C.saas_refresh_3day_production_mmeta

    class RefreshReqsCount(parameters.SandboxIntegerParameter):
        name = 'refresh_reqs_count'
        description = 'Refresh request count'
        group = _REFRESH_OPTIONS
        default_value = 10000

    class RefreshPlanRps(parameters.SandboxIntegerParameter):
        name = 'refresh_plan_rps'
        description = 'Refresh plan rps'
        group = _REFRESH_OPTIONS
        # rps are slow there (this helps with eventlog dumps)
        default_value = 1

    class ImgMmetaConf(parameters.SandboxStringParameter):
        name = 'img_mmeta_conf'
        description = 'Img mmeta conf'
        group = _IMG_MMETA_OPTIONS
        default_value = cluster_names.C.hamster_imgmmeta_man

    class ImgMmetaTag(parameters.SandboxStringParameter):
        name = 'img_mmeta_tag'
        description = 'Img mmeta tag'
        group = _IMG_MMETA_OPTIONS
        default_value = ''

    class ImgMmetaReqsCount(parameters.SandboxIntegerParameter):
        name = 'img_mmeta_reqs_count'
        description = 'Img mmeta request count'
        group = _IMG_MMETA_OPTIONS
        default_value = 5000

    class ImgMmetaPlanRps(parameters.SandboxIntegerParameter):
        name = 'img_mmeta_plan_rps'
        description = 'Img mmeta plan rps'
        group = _IMG_MMETA_OPTIONS
        default_value = 20

    class ImgIntConf(parameters.SandboxStringParameter):
        name = 'img_int_conf'
        description = 'Img int conf'
        group = _IMG_INT_OPTIONS
        default_value = cluster_names.C.hamster_imgsint_man

    class ImgIntTag(parameters.SandboxStringParameter):
        name = 'img_int_tag'
        description = 'Img int tag'
        group = _IMG_INT_OPTIONS
        default_value = ''

    class ImgIntReqsCount(parameters.SandboxIntegerParameter):
        name = 'img_int_reqs_count'
        description = 'Img int request count'
        group = _IMG_INT_OPTIONS
        default_value = 10000

    class ImgIntPlanRps(parameters.SandboxIntegerParameter):
        name = 'img_int_plan_rps'
        description = 'Img int plan rps'
        group = _IMG_INT_OPTIONS
        default_value = 100

    class VideoMmetaConf(parameters.SandboxStringParameter):
        name = 'video_mmeta_conf'
        description = 'Video mmeta conf'
        group = _VIDEO_MMETA_OPTIONS
        default_value = cluster_names.C.production_hamster_vidmmeta

    class VideoMmetaTag(parameters.SandboxStringParameter):
        name = 'video_mmeta_tag'
        description = 'Video mmeta tag'
        group = _VIDEO_MMETA_OPTIONS
        default_value = 'a_ctype_hamster'

    class VideoIntConf(parameters.SandboxStringParameter):
        name = 'video_int_conf'
        description = 'Video int conf'
        group = _VIDEO_INT_OPTIONS
        default_value = cluster_names.C.production_hamster_vidsint

    class VideoIntTag(parameters.SandboxStringParameter):
        name = 'video_int_tag'
        description = 'Video Int tag'
        # was 'Options', which separated the tag from the other video int settings
        group = _VIDEO_INT_OPTIONS
        default_value = 'a_ctype_hamster'

    class ResPrefix(parameters.SandboxStringParameter):
        name = 'resource_attr_name_prefix'
        description = 'Resource attr name prefix for testenv (value = datetime)'
        do_not_copy = True
        group = 'Options'

    class AdditionalAttr(parameters.SandboxStringParameter):
        """
            Attributes to add to the created resources (format: attr1=v1, attr2=v2)
        """
        name = 'additional_attr'
        description = 'Additional attributes to the created resources (ex.: attr1=v1, attr2=v2)'
        default_value = ''
        group = 'Options'

    # All options, concatenated into the task's input_parameters.
    params = (
        WebMmetaConf,
        WebMmetaTag,
        WebMmetaReqsCount,
        WebMmetaPlanRps,

        WebIntConf,
        WebIntTag,
        WebIntReqsCount,
        WebIntPlanRps,

        RefreshMmetaTag,
        RefreshMmetaConf,
        RefreshReqsCount,
        RefreshPlanRps,

        ImgMmetaConf,
        ImgMmetaTag,
        ImgMmetaReqsCount,
        ImgMmetaPlanRps,
        ImgIntConf,
        ImgIntTag,
        ImgIntReqsCount,
        ImgIntPlanRps,

        VideoMmetaConf,
        VideoMmetaTag,
        VideoIntConf,
        VideoIntTag,

        ResPrefix,
        AdditionalAttr,
    )


class FailIfNoInstances(parameters.SandboxBoolParameter):
    """
        If True the task ends in FAILURE, otherwise in EXCEPTION
        (applies when no instances are left to choose from).
    """
    name = 'fail_if_no_instances'
    description = 'FAILURE instead of EXCEPTION when no instances encountered'
    default_value = False
    group = 'Flow Control'


class GetMiddlesearchResources(SandboxTask):
    """
        **Description**

            Fetches resources for middlesearch (mmeta), int and fusion (refresh):
            config, database, queries, plan, models.

        **Parameters**

            *Get mmeta/int/refresh parameters*

            * **Get {int,mmeta,refresh} {config, requests, plan}** - a family of options that control
              whether the config, queries and plan are fetched for each search component
              (int, mmeta, refresh).
              Fetching the plan is programmatically synchronised with fetching the queries.
            * **Get mmeta/refresh data** (`get_mmeta_data`) - whether to fetch the database,
            * **Get mmeta models** (`get_mmeta_models`) - whether to fetch the models,

            *Queries options*

            * **Mark not cached queries as "&pron=nocachehit"** (``mark_queries``) -
                needed by the database acceptance task; marks the queries that missed the cache
                so that only those can be used later,
            * **Queries only for formula:** (``filter_queries_by_formula``) - keeps only
                the queries on which the given formula fired (for example "Ru" or "RuDownload")
                `!!! does not work together with mark_queries !!!`
            * **Sort queries and del repetitions** (``get_only_unique_queries``) - sort queries
                by "user_request" and drop repeated ones,
            * **Get only tur queries** (``get_only_tur_queries``) - fetch only Turkish queries

            *Options*

            * **Mmeta/Int/Refresh tag** (``mmeta/int/refresh_tag``) - the cms tag instances are taken from,
                if there are several tags the results are merged.
            * **Mmeta/Int/Refresh request count** (``mmeta/int/refresh_reqs_count``) - number of queries,
            * **Mmeta/Int/Refresh plan rps** (``mmeta/int/refresh_plan_rps``) - rps used in plans,
            * **Resource attr name prefix for testenv (value = datetime)** (``resource_attr_name_prefix``) -
                adds the attribute 'TE_<attr_prefix>_production_<host_type>_<resource_type>' to all resources,
            * **Additional attributes to the created resources (ex.: attr1=v1, attr2=v2)** (``additional_attr``) -
                adds the given attributes to all resources of the task

                * <attr_prefix> - the string from the ``resource_attr_name_prefix`` parameter,
                * <host_type> - ['mmeta', 'int', 'refresh'],
                * <resource_type> - ["cfg", "data", "reqs", "plan", "models"]


        **Created resources**

            Depending on the parameters:

            * MIDDLESEARCH_CONFIG
            * MIDDLESEARCH_DATA
            * IMAGES_MIDDLESEARCH_DATA
            * IMAGES_MIDDLESEARCH_INDEX
            * PLAIN_TEXT_QUERIES
            * MIDDLESEARCH_PLAN
            * DYNAMIC_MODELS_ARCHIVE
    """

    type = 'GET_MIDDLESEARCH_RESOURCES'

    execution_space = 54 * 1024  # Max detected usage is 49 Gb

    # disable LXC due to cqudp problems (see SEARCH-4055)
    client_tags = Tag.Group.LINUX & Tag.PORTOD

    input_parameters = (
        GetMmeta.params +
        GetInt.params +
        GetRefresh.params +
        GetImgMmeta.params +
        GetImgInt.params +
        GetVideoMmeta.params +
        GetVideoInt.params +
        QueriesOpts.params +
        Options.params +
        (
            MmetaInstance,
            IntInstance,
            RefreshInstance,
            ImgMmetaInstance,
            ImgIntInstance,
            # the video instance parameters were declared but never exposed
            VideoMmetaInstance,
            VideoIntInstance,
            FailIfNoInstances,
        )
    )

    # resource type key -> [description fragment, sandbox resource type, loader method name]
    res_types = {
        "cfg": ["cfg", resource_types.MIDDLESEARCH_CONFIG, 'load_config'],
        "data": ["data", resource_types.MIDDLESEARCH_DATA, 'load_data'],
        "img_data": ["img data", images_metasearch_resources.IMAGES_MIDDLESEARCH_DATA, 'load_img_data'],
        # description used to be a copy-paste of "img data"
        "img_index": ["img index", resource_types.IMAGES_MIDDLESEARCH_INDEX, 'load_img_index'],
        "reqs": ["requests", resource_types.PLAIN_TEXT_QUERIES, 'load_requests'],
        "plan": ["plan", ms_res.MiddlesearchPlan, 'load_plan'],
        "models": ["models", resource_types.DYNAMIC_MODELS_ARCHIVE, 'load_models'],
    }
    res_download_order = ["cfg", "data", "img_data", "img_index", "reqs", "plan", "models"]
    host_types = ['mmeta', 'int', 'refresh', 'img_mmeta', 'img_int', 'video_mmeta', 'video_int']

    def _iter_requested_resources(self):
        """
            Yield (resource_type, host_type, get_flag) for every enabled ``get_<host>_<resource>``
            context flag, in download order.
        """
        for resource_type in self.res_download_order:
            for host_type in self.host_types:
                get_flag = 'get_{}_{}'.format(host_type, resource_type)
                if self.ctx.get(get_flag, False):
                    yield resource_type, host_type, get_flag

    def on_enqueue(self):
        """
            Normalise dependent flags and pre-create a resource for every requested item.
        """
        SandboxTask.on_enqueue(self)

        # By some unknown reasons getting plans synchronised with getting queries
        # NOTE(review): GetImgInt.Plan is not synchronised here -- confirm it is intentional.
        self.ctx[GetInt.Plan.name] = self.ctx[GetInt.Requests.name]
        self.ctx[GetMmeta.Plan.name] = self.ctx[GetMmeta.Requests.name]
        self.ctx[GetImgMmeta.Plan.name] = self.ctx[GetImgMmeta.Requests.name]
        self.ctx[GetRefresh.Plan.name] = self.ctx[GetRefresh.Requests.name]

        # to get mmeta database need mmeta config
        if self.ctx[GetMmeta.Data.name]:
            self.ctx[GetMmeta.Config.name] = self.ctx[GetMmeta.Data.name]
        if self.ctx[GetImgMmeta.Data.name]:
            self.ctx[GetImgMmeta.Config.name] = self.ctx[GetImgMmeta.Data.name]

        for resource_type, host_type, _ in self._iter_requested_resources():
            self._create_middle_resource(resource_type, host_type)

    def on_execute(self):
        """
            Fill every resource created in on_enqueue and optionally tag it for TestEnv.

            Fails via ``eh.ensure`` when no ``get_*`` flag was enabled at all.
        """
        nanny_token = rm_sec.get_rm_token(self)
        # the mmeta helpers get half of the kill timeout (plus one) as their timeout
        mmeta_tout = self.ctx.get("kill_timeout", 10800) / 2 + 1
        instances = {
            "mmeta": _MmetaInstance(tout=mmeta_tout, nanny_token=nanny_token),
            "img_mmeta": _ImgMmetaInstance(tout=mmeta_tout, nanny_token=nanny_token),
            "video_mmeta": _VideoMmetaInstance(tout=mmeta_tout, nanny_token=nanny_token),
            "int": _IntInstance(nanny_token=nanny_token),
            "img_int": _ImgIntInstance(nanny_token=nanny_token),
            "video_int": _VideoIntInstance(nanny_token=nanny_token),
            "refresh": _RefreshInstance(),
        }

        some_action_requested = False
        task_timestamp = time.strftime('%d-%m-%y_%H:%M')

        for resource_type, host_type, get_flag in self._iter_requested_resources():
            logging.info('current flag = %s', get_flag)
            some_action_requested = True
            instance = instances[host_type]
            _, _, maker = self.res_types[resource_type]
            res_name = self.get_res_name(host_type)
            # same context key that _create_middle_resource stored the resource id under
            name = '{}_{}'.format(res_name, resource_type)
            resource = channel.sandbox.get_resource(self.ctx[name])
            self._load_resource(instance, maker, resource)

            # different searches need different resources in TestEnv,
            # so we add prefixes for attributes
            attr_prefix = self.ctx.get(Options.ResPrefix.name)
            if not attr_prefix:
                continue
            source = 'production'
            if (
                resource_type == 'cfg' and ('mmeta' in host_type or 'int' in host_type) and
                instance._conf_param and 'hamster' in instance._conf_param
            ):
                source = 'hamster'
            channel.sandbox.set_resource_attribute(
                resource.id,
                'TE_{}_{}_{}'.format(attr_prefix, source, name),
                task_timestamp,
            )

        eh.ensure(some_action_requested, "Nothing to do, please select at least one of Get ... parameters")

    def _create_middle_resource(self, res_type, host_type):
        """
            Create an empty resource for (res_type, host_type) and remember its id in the
            context under ``<res_name>_<res_type>``.
        """
        (descr, rtype, maker) = self.res_types[res_type]

        date = datetime.datetime.now().strftime("%d.%m.%Y")
        res_name = self.get_res_name(host_type)
        descr = '{} {}, {}'.format(res_name, descr, date)
        name = '{}_{}'.format(res_name, res_type)

        attributes = string.parse_attrs(self.ctx.get(Options.AdditionalAttr.name))

        resource = self.create_resource(descr, name, rtype, attributes=attributes)
        self.ctx[name] = resource.id

    def get_res_name(self, host_type):
        """
            Return the ``<host_type>_conf`` context value with leading/trailing dashes removed,
            falling back to ``host_type`` when it is missing or empty.
        """
        # "or ''" guards against an explicit None stored in the context
        conf = self.ctx.get("{}_conf".format(host_type), "") or ""
        return conf.strip("-") or host_type

    def _load_resource(self, instance, maker, resource):
        """
            Call ``instance.<maker>(resource)``, switching to another instance after each failure.

            Gives up (re-raising the last error) after ``max_attempts`` tries.
        """
        max_attempts = 20
        for attempt in range(1, max_attempts + 1):
            try:
                logging.info('try loading resource %s, attempt # %s of %s', resource.type, attempt, max_attempts)
                getattr(instance, maker)(resource)
                if (
                    resource.type == ms_res.MiddlesearchPlan and
                    utils.get_or_default(self.ctx, QueriesOpts.MarkQueries)
                ):
                    channel.sandbox.set_resource_attribute(
                        resource.id, 'priemka_basesearch_database_plan', 'yes'
                    )
                logging.info('loaded resource %s', resource.type)
                return

            except Exception:
                logging.error('error: %s', eh.shifted_traceback())
                if attempt == max_attempts:
                    raise
                instance.choose_random_instance()
        # only reachable if max_attempts is ever set to zero
        eh.fail('Cannot load resource {}'.format(resource.type))


####################################################################################################


class _InstanceBase(object):
    def __init__(self):
        """
            Start with nothing resolved: no instance pool, no current host and no files.
        """
        # settings read by _get_instances; they are None here, so presumably
        # subclasses assign them -- TODO confirm
        self._conf_param = self._instance_param = self._tag_param = self._nanny_token = None

        # pool of candidate instances and the currently selected one
        self.instances = None
        self.random_choice = True  # turned off by _parse once an instance is nailed
        self.host = self.port = self.instance_name = None

        # paths of the files produced by the load_* methods
        self.req_file_name = self.config_path = None
        self.owner = None

    @staticmethod
    def _ctx_param(param):
        """
            Look *param* up in the context of the current task via ``utils.get_or_default``.
        """
        task_ctx = channel.task.ctx
        return utils.get_or_default(task_ctx, param)

    def _parse(self, instance, conf='HEAD'):
        """
            Nail the instance given as a "host:port" string.

            :param instance: user supplied "host:port"; empty, None or blank means "not given"
            :param conf: configuration name appended to ``instance_name``
            :return: True when host/port were set and random choice disabled, False when
                     nothing was given
            Reports a malformed non-empty value through ``eh.ensure``.
        """
        # surrounding whitespace from the task form would otherwise end up in the host
        # name, and a blank value would be reported as unparsable
        instance = (instance or '').strip()
        if not instance:
            # ignore empty strings
            return False

        m = re.match(r'([^:]+):(\d+)', instance)
        eh.ensure(m, "Cannot parse instance '{}'".format(instance))

        self.host = m.group(1)
        self.port = int(m.group(2))
        self.instance_name = '{}:{}@{}'.format(self.host, self.port, conf)
        self.random_choice = False
        return True

    def _get_instances(self):
        """
            Lazily resolve the instance to work with.

            Does nothing while the pool is non-empty. Otherwise either nails the instance
            given in the task parameters or fetches the pool for the configured tag and
            picks a random member. An emptied pool is falsy, so it is resolved again.
        """
        if not self.instances:
            explicit_instance = self._ctx_param(self._instance_param)
            nailed = self._parse(explicit_instance, self._conf_param)
            if not nailed:
                self.instances = _get_instances_internal(
                    instance_tag_name=self._ctx_param(self._tag_param),
                    conf=self._conf_param,
                    nanny_token=self._nanny_token,
                )
                self.choose_random_instance()

    def choose_random_instance(self):
        """
            Switch to a random instance from the pool and remove it from the pool.

            A nailed instance (``random_choice`` off) is left untouched. An empty pool is
            reported through ``eh.ensure`` or ``eh.verify`` depending on FailIfNoInstances.
        """
        if not self.random_choice:
            # instance is nailed
            logging.debug("Instance is nailed, choosing the same %s", self.instance_name)
            return

        cls_name = self.__class__.__name__
        no_instances_msg = 'No more instances for {}'.format(cls_name)
        report = eh.ensure if self._ctx_param(FailIfNoInstances) else eh.verify
        report(self.instances, no_instances_msg)

        picked = random.choice(self.instances)
        self.instances.remove(picked)  # never offer the same instance twice

        self.host = picked["host"]
        self.port = picked["port"]
        self.instance_name = '{}:{}@{}'.format(self.host, self.port, self._conf_param)

        logging.info("Chosen instance: %s:%s for %s", self.host, self.port, cls_name)

    def load_config(self, resource):
        """
            Download the config of the current instance into ``resource.path``.

            Configs containing '_CONQUISTA_' are rejected and another random instance is
            tried. A nailed instance cannot be replaced, so in that case the problem is
            reported through ``eh.check_failed`` instead of re-downloading forever.
        """
        self._get_instances()
        while True:
            utils.get_config_by_info_request(self.host, self.port, resource.path)
            self.config_path = resource.path
            config_contents = fu.read_file(self.config_path)
            # we don't need strange configuration files
            if '_CONQUISTA_' not in config_contents:
                return
            if not self.random_choice:
                # choose_random_instance() is a no-op for a nailed instance, so looping
                # would fetch the very same config again and again
                eh.check_failed(
                    'Nailed instance {} serves an unsuitable config (contains _CONQUISTA_)'.format(
                        self.instance_name
                    )
                )
            self.choose_random_instance()

    def _load_all_queries_from_evlog(self, resource, params, dumped_count, do_mark, filter_by_formula):
        """
            Dump queries from the eventlogs of the current host into ``resource.path``.

            :param resource: resource whose path receives the dumped lines
            :param params: dict with the 'name', 'port' and 'rps' of the searcher
            :param dumped_count: used as min/max of the events filter and as the
                                 ``start_tstamp`` count
            :param do_mark: request the events needed for marking queries
            :param filter_by_formula: request the events needed for the formula filter
                                      (ignored when ``do_mark`` is set)
            :return: dumped lines without the '?ms=proto&hr=da' probes; an empty list
                     when the dump failed and the error was ignored
            A missing vault key is reported through ``eh.check_failed``.
        """
        events_filter = dict(
            min=dumped_count,
            max=dumped_count,
            fields=[0],
        )
        if do_mark or filter_by_formula:
            # ContextCreated plus SearchedResultStat (marking) or DebugMessage (formula)
            event_ids = [287, 308] if do_mark else [287, 313]
            events_filter.update(frame=True, name=True)
        else:
            event_ids = [286, 287]  # FastCacheHit, ContextCreated
            events_filter.update(prefix='?')

        queries_path = channel.task.abs_path(resource.path)
        try:
            utils.receive_skynet_key(channel.task.owner)
            res = eventlog.get_eventlogs(
                [
                    "current-eventlog-{}-{}".format(params['name'], params['port']),
                    "eventlog-{}-{}.PREV".format(params['name'], params['port']),
                ],
                [self.host],
                target=queries_path,
                port=params['port'],
                compress=False,
                event_ids=event_ids,
                events_filter=events_filter,
                start_tstamp=dict(
                    count=dumped_count,
                    rps=params['rps'],
                ),
                user='prod-resource-getter',
            )
            eh.verify(res, 'Cannot load queries from host {}'.format(self.host))

            all_queries = fu.read_lines(queries_path)
            logging.info('All queries length from file %s: %s', queries_path, len(all_queries))

            # remove 'production-woodpecker' queries
            return [q for q in all_queries if q != '?ms=proto&hr=da']
        except Exception as e:
            logging.debug('Error traceback: %s', eh.shifted_traceback())
            if isinstance(e, common_errors.VaultNotFound):
                # the key is requested for the task owner; self.owner is None unless
                # something else assigned it, which made the message say "user None"
                vault_user = self.owner or channel.task.owner
                message = 'There is no key in vault for user {}\n{}'.format(vault_user, e)
                eh.check_failed(message)
            # best effort: any other failure just means "no queries from this host"
            logging.info('Ignore error: %s, %s', self.host, e)
        return []

    def _load_requests_impl(self, resource, req_count, params):
        """
            Collect ``req_count`` queries from instance eventlogs and write them to ``resource.path``.

            Queries are dumped from the current instance, filtered according to the
            QueriesOpts parameters and accumulated; while there are too few, another
            random instance is chosen and the dump is repeated.

            :param resource: PLAIN_TEXT_QUERIES resource to fill
            :param req_count: number of queries to write
            :param params: dict with the 'name', 'port' and 'rps' of the searcher
        """
        #  if want only unique queries - more initial queries should be dumped
        sorted_and_unique = self._ctx_param(QueriesOpts.UniqueQueries)
        make_unique_reqids = self._ctx_param(QueriesOpts.MakeUniqueReqids)
        add_random_pron = self._ctx_param(QueriesOpts.AddRandomPron)
        do_mark = self._ctx_param(QueriesOpts.MarkQueries)
        formula = self._ctx_param(QueriesOpts.ByFormula)
        only_tur = self._ctx_param(QueriesOpts.TurQueries)
        #  need more queries to get rid of some bad queries later
        #  as there are only few tur queries - dump all eventlog in that case
        #  if use formula filter - dump all eventlog
        if only_tur or formula:
            dumped_count = 0
        else:
            dumped_count = int(req_count * (2 + sorted_and_unique)) + 10
        logging.info("Requested %s queries, trying to dump...", req_count)
        loaded_queries = []

        while len(loaded_queries) < req_count:
            queries = self._load_all_queries_from_evlog(resource, params, dumped_count, do_mark, formula)
            logging.info("got %s queries from this instance", len(queries))

            if params['name'] == 'int':
                # keep only search queries (SEARCH-1089)
                logging.info("filtering only search queries for int")
                queries = [q for q in queries if "&dh=" not in q]
            if do_mark:
                queries = self._get_marked_queries(queries)
            if formula:
                queries = self._filter_queries_by_formula(queries, formula)
            if only_tur:
                logging.info("filter only tur queries")
                queries = [q for q in queries if 'tld=com.tr' in q]
            if add_random_pron:
                # add unique pron parameters for each query to defeat cache issues, see SEARCH-2284
                queries = [
                    str(q) + '&pron=search2284-' + str(index)
                    for index, q in enumerate(queries, 1)
                ]

            if sorted_and_unique:
                queries = list(search_queries.get_unique_queries(queries, 'user_request'))
            elif make_unique_reqids:
                queries = self._make_unique_reqids(queries)

            loaded_queries += queries
            logging.info("got %s queries after filtering", len(loaded_queries))

            if len(loaded_queries) < req_count:
                if not self.random_choice and not queries:
                    # a nailed instance cannot be replaced; without this check an
                    # instance that yields nothing would be polled forever
                    eh.check_failed(
                        'Nailed instance {} yielded no usable queries'.format(self.instance_name)
                    )
                # rotate only when more queries are needed: rotating after the final
                # batch wasted an instance and failed on an exhausted pool
                self.choose_random_instance()

        fu.write_lines(resource.path, loaded_queries[:req_count])
        channel.sandbox.set_resource_attribute(resource.id, 'amount_of_queries', req_count)
        self.req_file_name = resource.path

    @staticmethod
    def _make_unique_reqids(queries):
        """
        Make the ``reqid`` CGI parameter unique across the given queries.

        Queries that have no ``&reqid=`` parameter, or have an empty one, are
        skipped with a warning. When a reqid has already been seen, ``00<N>`` is
        appended to its second dash-separated part (to the whole reqid when it
        contains no dash), where N is the smallest positive counter that yields a
        reqid not seen so far. Every candidate is derived from the query's own
        reqid, so the suffix never eats characters of the reqid itself.

        :param queries: iterable of query strings
        :return: list of queries with unique reqids, in the input order
        """
        reqid_re = re.compile(r"&reqid=([^&\s]*)")
        result = []
        seen_reqids = set()  # membership is tested for every candidate
        for q in queries:
            found = reqid_re.search(q)
            if found is None:
                logging.warning("Skip query without '&reqid': '%s'", q)
                continue
            init_reqid = found.group(1)
            if not init_reqid:
                logging.warning("Skip query with empty '&reqid': '%s'", q)
                continue

            base_parts = init_reqid.split("-")
            # The suffix goes to the part after the first dash; a reqid without
            # dashes has a single part, which then receives the suffix.
            suffix_idx = 1 if len(base_parts) > 1 else 0

            curr_reqid = init_reqid
            counter = 0
            while curr_reqid in seen_reqids:
                counter += 1
                logging.debug("Make unique reqid = '%s'", curr_reqid)
                parts = list(base_parts)
                parts[suffix_idx] += "00{}".format(counter)  # append some digits to hash
                curr_reqid = "-".join(parts)

            # Every occurrence of the initial reqid inside the query is replaced.
            result.append(q if counter == 0 else q.replace(init_reqid, curr_reqid))
            seen_reqids.add(curr_reqid)
        return result

    @staticmethod
    def _filter_queries_by_formula(fields, formula_name):
        """
        Select queries whose eventlog frame mentions the given ranking formula.

        Every item of ``fields`` is a ``frame<TAB>event name<TAB>text`` record
        (the text itself may contain tabs). A frame is selected when one of its
        ``DebugMessage`` records matches ``rank model.*[<formula_name>]``; the
        query is taken from the ``ContextCreated`` record of the same frame and
        prefixed with ``?``. Selected frames without a ``ContextCreated`` record
        are skipped.

        :param fields: iterable of tab-separated eventlog records
        :param formula_name: formula name; it is inserted into a regular
            expression as is
        :return: list of queries, ordered by the first matching record of their frame
        :raises ValueError: if a record has fewer than three tab-separated parts
        """
        logging.info('start filtering queries by formula: %s', formula_name)

        r = re.compile(r"rank model.*\[{}\]".format(formula_name))

        frames_with_formula = []  # unique frames in first-match order
        seen_frames = set()
        queries_by_frames = {}

        for field in fields:
            # maxsplit keeps tabs that belong to the record text
            frame, name, text = field.split("\t", 2)
            if name == 'DebugMessage':
                if frame not in seen_frames and r.search(text):
                    seen_frames.add(frame)
                    frames_with_formula.append(frame)
            elif name == 'ContextCreated':
                queries_by_frames[frame] = '?' + text

        logging.info(
            'finish filtering queries by formula\n'
            'num of queries: %s, num of frames_with_formula: %s',
            len(queries_by_frames), len(frames_with_formula)
        )

        # A matching frame may have no ContextCreated record among the given
        # records; such frames have no query to return.
        return [queries_by_frames[f] for f in frames_with_formula if f in queries_by_frames]

    @staticmethod
    def _get_marked_queries(fields):
        """
        Build queries from eventlog records and mark the non-cached ones.

        Every item of ``fields`` must consist of exactly three whitespace-separated
        parts: frame, event name and payload. The query of a frame is the payload
        of its ``ContextCreated`` record prefixed with ``?``. ``&pron=nocachehit``
        is appended to the query once per ``SearchedResultStat`` record seen for
        the same frame.

        :param fields: iterable of eventlog records
        :return: the queries (values of the frame -> query mapping)
        """
        logging.info('start marking nocached queries')

        queries_by_frames = {}
        mark_frames = []

        for record in fields:
            frame, event, payload = record.split()
            if event == 'ContextCreated':
                queries_by_frames[frame] = '?' + payload
            elif event == 'SearchedResultStat':
                mark_frames.append(frame)

        logging.info('mark_frames = %s', ', '.join(mark_frames))

        for frame in mark_frames:
            if frame not in queries_by_frames:
                # no query is known for this frame, nothing to mark
                continue
            queries_by_frames[frame] += '&pron=nocachehit'

        logging.info(
            'num of queries: %s, num of marked queries: %s',
            len(queries_by_frames), len(mark_frames)
        )
        logging.info('finish marking nocached queries')

        return queries_by_frames.values()

    def _load_plan_impl(self, resource, rps):
        """
        Build a plan from the previously saved requests file.

        When no requests file was saved (``self.req_file_name`` is empty) the plan
        resource is deleted. Otherwise every request line gets the
        ``http://localhost:12345/yandsearch`` prefix, the result is written to a
        temporary file and the planner obtained from ``DolbilkaPlanner`` is run
        over it, writing the plan to ``resource.path``.

        :param resource: plan resource (its ``path`` and ``id`` are used)
        :param rps: value passed to the planner with the ``-q`` option
        """
        if not self.req_file_name:
            channel.sandbox.delete_resource(resource.id)
            return

        tmp_name = "{}.tmp".format(os.path.basename(self.req_file_name))
        tmp_reqs = channel.task.abs_path(tmp_name)
        with open(self.req_file_name, 'r') as in_file, open(tmp_reqs, 'w') as out_file:
            # lines keep their own line endings, so nothing is appended here
            out_file.writelines(
                "http://localhost:12345/yandsearch{}".format(line) for line in in_file
            )

        cmd = [
            DolbilkaPlanner.get_planner_path(),
            '-l', tmp_reqs,
            '-o', resource.path,
            '-t', 'plain',
            '-q', str(rps),
        ]
        process.run_process(cmd, log_prefix='planner', wait=True, timeout=300)

    def load_data(self, resource):
        """Report a failure via ``eh.fail``: the default ``load_data`` is not meant to be called."""
        message = "Default load_data should not be called for {}".format(resource)
        eh.fail(message)

    def load_models(self, resource):
        """Load models into ``resource``; the default implementation delegates to ``_load_models_cms``."""
        # default faulty behaviour (cms is a history)
        self._load_models_cms(resource)

    def _load_models_cms(self, resource):
        """
        Fetch ``models.archive`` from CMS and write it to ``resource.path``.

        Instances are resolved first; the archive is requested for
        ``self.instance_name`` with the configuration stored in ``self._conf_param``.

        :param resource: resource whose ``path`` receives the archive
        """
        self._get_instances()
        logging.info("Load models, instance name: %s", self.instance_name)
        # The target file is opened (and truncated) before the archive is requested.
        with open(resource.path, 'wb') as out:
            out.write(
                cms.load_cms_resource(
                    'models.archive',
                    instance_name=self.instance_name,
                    cms_configuration=self._conf_param,
                )
            )


class _MmetaInstance(_InstanceBase):
    """Instance helper configured from the ``WebMmeta*`` options."""

    def __init__(self, tout=10800, nanny_token=None):
        """
        :param tout: timeout passed to the remote copy in ``load_data``
        :param nanny_token: stored as ``self._nanny_token``
        """
        super(_MmetaInstance, self).__init__()
        self._tout = tout
        self._instance_param = MmetaInstance
        self._tag_param = Options.WebMmetaTag
        self._conf_param = self._ctx_param(Options.WebMmetaConf)
        self._nanny_token = nanny_token

    def load_data(self, resource):
        """Recreate the ``resource.path`` folder and copy ``WebSettings.MIDDLESEARCH_DATA`` into it via skynet."""
        destination = resource.path
        paths.make_folder(destination, delete_content=True)
        channel.task.remote_copy(
            WebSettings.MIDDLESEARCH_DATA,
            destination,
            protocol='skynet',
            create_resource=False,
            timeout=self._tout,
        )

    def load_requests(self, resource):
        """Resolve instances and collect ``WebMmetaReqsCount`` queries into ``resource``."""
        self._get_instances()
        req_count = self._ctx_param(Options.WebMmetaReqsCount)
        dump_params = {'rps': 5, 'name': 'mmeta', 'port': self.port}
        self._load_requests_impl(resource, req_count, dump_params)

    def load_plan(self, resource):
        """Resolve instances and build a plan with ``WebMmetaPlanRps`` converted to int."""
        self._get_instances()
        rps = int(self._ctx_param(Options.WebMmetaPlanRps))
        self._load_plan_impl(resource, rps)

    def load_models(self, resource):
        """Copy the last released ``DYNAMIC_MODELS_ARCHIVE`` resource to ``resource.path``."""
        # _load_cms_models is useless here, so we obtain latest stable released models
        models_id = utils.get_and_check_last_released_resource_id(
            resource_type=resource_types.DYNAMIC_MODELS_ARCHIVE,
        )
        models_path = channel.task.sync_resource(models_id)
        # NOTE(review): a list command is combined with shell=True; confirm that
        # run_process passes all list items to the shell command as intended.
        copy_cmd = ['cp', models_path, resource.path]
        proc = process.run_process(copy_cmd, shell=True, log_prefix="copy_models")
        eh.ensure(proc.returncode == 0, "Copying resource failed")


class _IntInstance(_InstanceBase):
    """Instance helper configured from the ``WebInt*`` options; only requests and plans can be loaded."""

    def __init__(self, nanny_token=None):
        """
        :param nanny_token: stored as ``self._nanny_token``
        """
        super(_IntInstance, self).__init__()
        self._instance_param = IntInstance
        self._tag_param = Options.WebIntTag
        self._conf_param = self._ctx_param(Options.WebIntConf)
        self._nanny_token = nanny_token

    def load_data(self, resource):
        """Report a failure via ``eh.fail``: data loading is not supported here."""
        message = "Int.load_data must not be called, {}".format(resource)
        eh.fail(message)

    def load_models(self, resource):
        """Report a failure via ``eh.fail``: models loading is not supported here."""
        message = "Int.load_models must not be called, {}".format(resource)
        eh.fail(message)

    def load_requests(self, resource):
        """Resolve instances and collect ``WebIntReqsCount`` queries into ``resource``."""
        self._get_instances()
        req_count = self._ctx_param(Options.WebIntReqsCount)
        dump_params = {'rps': 20, 'name': 'int', 'port': self.port}
        self._load_requests_impl(resource, req_count, dump_params)

    def load_plan(self, resource):
        """Resolve instances and build a plan with ``WebIntPlanRps``."""
        self._get_instances()
        rps = self._ctx_param(Options.WebIntPlanRps)
        self._load_plan_impl(resource, rps)


class _ImgMmetaInstance(_MmetaInstance):
    """Instance helper configured from the ``ImgMmeta*`` options, with rsync loading of rearrange data and index."""

    def __init__(self, tout=10800, nanny_token=None):
        """
        :param tout: passed to the parent constructor
        :param nanny_token: stored as ``self._nanny_token``
        """
        super(_ImgMmetaInstance, self).__init__(tout)
        self._instance_param = ImgMmetaInstance
        self._tag_param = Options.ImgMmetaTag
        self._conf_param = self._ctx_param(Options.ImgMmetaConf)
        self._nanny_token = nanny_token

    def load_img_data(self, resource):
        """Load the directory referenced by the ``Collection/RearrangeDataDir`` config option."""
        self.load_resource(resource, 'Collection/RearrangeDataDir', 'rearrange data')

    def load_img_index(self, resource):
        """Load the directory referenced by the ``Collection/RearrangeIndexDir`` config option."""
        self.load_resource(resource, 'Collection/RearrangeIndexDir', 'rearrange index')

    def load_resource(self, resource, opt, descr):
        """
        Copy the directory referenced by a config option from the instance host.

        :param resource: resource whose ``path`` folder is recreated and filled
        :param opt: config option that holds the path to copy
        :param descr: human-readable name of the copied data, used for logging
        """
        if self.host is None:
            self._get_instances()
        paths.make_folder(resource.path, delete_content=True)
        path_to_db = self._get_db_rel_path_from(opt)
        remote_path = "rsync://{}/{}".format(self.host, path_to_db)
        # log exactly the location that is copied from
        logging.info("get %s from %s", descr, remote_path)
        _remote_copy(remote_path, resource)

    def _get_db_rel_path_from(self, opt):
        """Return the path stored under ``opt`` in the config file at ``self.config_path``."""
        return SearchConfig.get_config_from_file(self.config_path).get_db_rel_path_from(opt)

    def load_models(self, resource):
        """Load models from CMS instead of using the parent implementation."""
        # default faulty behaviour (cms is a history)
        self._load_models_cms(resource)


class _ImgIntInstance(_IntInstance):
    """Instance helper configured from the ``ImgInt*`` options."""

    def __init__(self, nanny_token=None):
        """
        :param nanny_token: stored as ``self._nanny_token``
        """
        super(_ImgIntInstance, self).__init__()
        self._instance_param = ImgIntInstance
        self._tag_param = Options.ImgIntTag
        self._conf_param = self._ctx_param(Options.ImgIntConf)
        self._nanny_token = nanny_token

    def load_requests(self, resource):
        """Resolve instances and collect ``ImgIntReqsCount`` queries into ``resource``."""
        self._get_instances()
        req_count = self._ctx_param(Options.ImgIntReqsCount)
        dump_params = {'rps': 20, 'name': 'int', 'port': self.port}
        self._load_requests_impl(resource, req_count, dump_params)

    def load_plan(self, resource):
        """Resolve instances and build a plan with ``ImgIntPlanRps``."""
        self._get_instances()
        rps = self._ctx_param(Options.ImgIntPlanRps)
        self._load_plan_impl(resource, rps)


class _VideoMmetaInstance(_MmetaInstance):
    """Instance helper configured from the ``VideoMmeta*`` options."""

    def __init__(self, tout=10800, nanny_token=None):
        """
        :param tout: passed to the parent constructor
        :param nanny_token: stored as ``self._nanny_token``
        """
        super(_VideoMmetaInstance, self).__init__(tout)
        self._instance_param = VideoMmetaInstance
        self._tag_param = Options.VideoMmetaTag
        self._conf_param = self._ctx_param(Options.VideoMmetaConf)
        self._nanny_token = nanny_token

    def load_models(self, resource):
        """Load models from CMS instead of using the parent implementation."""
        # default faulty behaviour (cms is a history)
        self._load_models_cms(resource)


class _VideoIntInstance(_IntInstance):
    """Instance helper configured from the ``VideoInt*`` options; everything else is inherited."""

    def __init__(self, nanny_token=None):
        """
        :param nanny_token: stored as ``self._nanny_token``
        """
        super(_VideoIntInstance, self).__init__()
        self._instance_param = VideoIntInstance
        self._tag_param = Options.VideoIntTag
        self._conf_param = self._ctx_param(Options.VideoIntConf)
        self._nanny_token = nanny_token


class _RefreshInstance(_InstanceBase):
    """Instance helper configured from the ``Refresh*`` options; only requests and plans can be loaded."""

    def __init__(self, tout=10800):
        """
        :param tout: stored as ``self._tout``
        """
        super(_RefreshInstance, self).__init__()
        self._tout = tout
        self._instance_param = RefreshInstance
        self._tag_param = Options.RefreshMmetaTag
        self._conf_param = self._ctx_param(Options.RefreshMmetaConf)

    def load_data(self, resource):
        """Report a failure via ``eh.fail``: data loading is not supported here."""
        message = "Refresh.load_data must not be called (resource passed: {})".format(resource)
        eh.fail(message)

    def load_requests(self, resource):
        """Resolve instances and collect ``RefreshReqsCount`` queries into ``resource``."""
        self._get_instances()
        req_count = self._ctx_param(Options.RefreshReqsCount)
        dump_params = {'rps': 5, 'name': 'mmeta', 'port': self.port}
        self._load_requests_impl(resource, req_count, dump_params)

    def load_plan(self, resource):
        """Resolve instances and build a plan with ``RefreshPlanRps`` converted to int."""
        self._get_instances()
        rps = int(self._ctx_param(Options.RefreshPlanRps))
        self._load_plan_impl(resource, rps)


@decorators.retries(10)
def _get_instances_internal_yp(endpoint_set):
    """
    Resolve the endpoints of a YP endpoint set.

    :param endpoint_set: string of the form ``<cluster name>@<endpoint set id>``
    :return: endpoints of the resolved endpoint set
    """
    # Imported here rather than at the module level.
    from infra.yp_service_discovery.python.resolver.resolver import Resolver
    from infra.yp_service_discovery.api import api_pb2

    # Only the first "@" separates the cluster name from the endpoint set id.
    cluster, set_id = endpoint_set.split("@", 1)

    yp_resolver = Resolver(client_name='GetMiddlesearchResources', timeout=60)
    req = api_pb2.TReqResolveEndpoints()
    req.cluster_name = cluster
    req.endpoint_set_id = set_id

    logging.info("Resolving endpoint set id '%s' in dc '%s'", set_id, cluster)
    response = yp_resolver.resolve_endpoints(req)
    return response.endpoint_set.endpoints

def _get_instances_internal(instance_tag_name, conf, nanny_token=None):
    """
    Obtain instances list [{"host": "host", "port": "port"}, ...]

    :param instance_tag_name: instance tag names (comma-separated)
    :param conf: configuration name prefix; a value starting with "y@" is treated
        as a YP endpoint set and the rest of it is resolved via
        ``_get_instances_internal_yp``
    :param nanny_token: passed to ``ir.get_instances_by_config``; not used for
        "y@" configurations
    :return: instances list [{"host": "host", "port": "port"}, ...]
    """
    logging.info("_get_instances_internal: tag name '%s', conf '%s'", instance_tag_name, conf)
    if "," in instance_tag_name:
        # NOTE(review): the list of tags is wrapped into one more list here;
        # confirm that the instance resolver expects this shape.
        inst_tags = [instance_tag_name.replace(" ", "").split(",")]
    else:
        inst_tags = instance_tag_name

    if conf.startswith("y@"):
        endpoints = _get_instances_internal_yp(conf[2:])
        instances = [{"host": x.fqdn, "port": x.port} for x in endpoints]
    else:
        resolved = ir.get_instances_by_config(conf, inst_tags, nanny_token)
        instances = [{"host": h, "port": p} for h, p in resolved]

    msg = "Got {} instances for instance tag: '{}', conf: '{}'".format(len(instances), inst_tags, conf)
    # an empty instances list is reported via eh.ensure with the same message
    eh.ensure(instances, msg)
    logging.info(msg)
    return instances


def _remote_copy(remote_path, resource, local_path=None, tout=10800):
    """
    Copy ``remote_path`` via rsync using the current task's ``remote_copy``.

    :param remote_path: source location
    :param resource: resource whose ``path`` is the destination when ``local_path`` is not given
    :param local_path: explicit destination; ``None`` means ``resource.path``
    :param tout: timeout passed to ``remote_copy``
    """
    destination = resource.path if local_path is None else local_path
    channel.task.remote_copy(
        remote_path,
        destination,
        protocol='rsync',
        create_resource=False,
        timeout=tout,
    )


# Expose GetMiddlesearchResources under the module-level name __Task__.
# NOTE(review): presumably the name Sandbox looks up to find the task class; confirm.
__Task__ = GetMiddlesearchResources
