# -*- coding: utf-8 -*-

import json
import logging
import os

import sandbox.common.types.client as ctc

from sandbox.projects import resource_types

from sandbox.projects.common import file_utils as fu
from sandbox.projects.common import utils
from sandbox.projects.common.search import bugbanner

from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.channel import channel
from sandbox.sandboxsdk import process
from sandbox.sandboxsdk import parameters as sp
from sandbox.sandboxsdk import errors
from sandbox.sandboxsdk import paths

# Task context (ctx) keys under which the ids of the output resources
# created in on_enqueue are stored.
OUT_FACTOR_NAMES = 'out_factor_names'
OUT_FACTOR_BORDERS = 'out_factor_borders'
# Not read or written by any live code in the visible part of this module.
OUT_POOL = 'out_pool'

# Name of the task-local path the shard is copied to before preprocessing.
PREPROCESSED_BASE = 'preprocessed_base'

# Not referenced in the visible part of this module.
SHARD_NAME = 'shard_name'

# Captions used as the ``group`` attribute of the parameter classes.
_main = "Main"
_token = "YT token"
_images = "Images-only options"
_options = "Options"


class MakePoolParams:
    """
        Namespace with the input parameter declarations of the pool-making task.

        Every nested class describes one parameter; ``params`` is the tuple
        that the task class exposes as its ``input_parameters``.
    """

    class OutputTable(sp.SandboxStringParameter):
        """
            YT/MR output table
        """
        name = 'output_table'
        description = 'YT/MR output table'
        required = True
        group = _main

    class IdxOpsExecutable(sp.LastReleasedResource):
        """
            idx_ops binary
        """
        name = 'idx_ops_resource_id'
        description = 'idx_ops binary'
        resource_type = resource_types.IDX_OPS_EXECUTABLE
        group = _main

    class SearchDatabaseResource(sp.ResourceSelector):
        """
            Search shard
        """
        name = 'db_resource_id'
        description = 'Search database resource'
        resource_type = (
            resource_types.SEARCH_DATABASE,
            resource_types.IMAGES_SEARCH_DATABASE,
        )
        required = True
        group = _main

    class Models(sp.LastReleasedResource):
        """
            MatrixNet models
        """
        name = 'models_resource_id'
        description = 'Dynamic models archive'
        resource_type = (
            resource_types.DYNAMIC_MODELS_ARCHIVE,
            resource_types.DYNAMIC_MODELS_ARCHIVE_BASE,
        )
        group = _main

    class Requests(sp.ResourceSelector):
        """
            Requests (requests.tsv.gz)
        """
        name = 'requests_resource_id'
        description = 'Requests from FML'
        resource_type = resource_types.FML_REQUESTS_TABLE
        required = True
        group = _main

    class Ratings(sp.ResourceSelector):
        """
            Ratings (ratings.tsv.gz)
        """
        name = 'ratings_resource_id'
        description = 'Ratings from FML'
        resource_type = resource_types.FML_RATINGS_TABLE
        required = True
        group = _main

    class RankingUrlsMap(sp.ResourceSelector):
        """
            Map of rated URLs (primus-JudTier-*-urls.tsv.gz)
        """
        name = 'ranking_urls_map'
        description = 'Ranking URLs canonization table'
        resource_type = resource_types.JUDTIER_DUPS_TABLE
        group = _main

    class BasePreprocessor(sp.ResourceSelector):
        """
            Binary or script that processes the base before the pool is collected
        """
        name = 'base_preprocessor'
        description = 'Executable to preprocess base'
        resource_type = None  # allow any executable
        group = _main

    class TierParameter(sp.SandboxStringParameter):
        """
            String that goes into the Tier field of the protopool
        """
        name = 'tier'
        description = 'Tier'
        default_value = 'TestTier'
        group = _options

    class IndexAnnBetaMode(sp.SandboxBoolParameter):
        """
            Enables calcing features of experimental stream
        """
        name = 'indexann_beta_enabled'
        description = 'Enable experimental stream text machine features calculation'
        default_value = False
        group = _options

    class TextMachineLimitsMonitor(sp.SandboxBoolParameter):
        """
            Enables counting number of hits loaded from indexes and reached TextMachine
        """
        name = 'tm_limits_monitors_enabled'
        description = 'Enable text-machine limits monitors'
        default_value = False
        group = _options

    class CollectDocTexts(sp.SandboxBoolParameter):
        """
            Collect the pool together with document texts (SEARCH-480)
        """
        name = 'collect_doc_texts'
        description = 'Collect doc texts'
        default_value = False
        group = _options

    class CollectStreamHits(sp.SandboxBoolParameter):
        """
            Collect the pool with per-stream lingboost hits.
            Works when lingboost CGI parameters are present (&qbundle=, &pron=qbundleiter).
        """
        name = 'collect_stream_hits'
        description = 'Test stream hits collection'
        default_value = False
        group = _options

    class AdditionalRunParams(sp.SandboxStringParameter):
        """
            Additional idx_ops launch parameters
        """
        name = 'additional_params'
        description = 'Additional command-line parameters'
        default_value = ""
        group = _options

    class UseUrlsFromIndex(sp.SandboxBoolParameter):
        """
            Load urls from index instead of url.dat
        """
        name = 'use_urls_from_index'
        description = 'Use urls from index'
        default_value = False
        group = _images

    class VaultOwner(sp.SandboxStringParameter):
        """
            Owner of the sandbox vault holding the YT token
        """
        name = 'vault_owner'
        description = 'Vault owner'
        default_value = 'RATED-POOL-MAKERS'
        group = _token

    class VaultName(sp.SandboxStringParameter):
        """
            Name of the sandbox vault holding the YT token
        """
        name = 'vault_name'
        description = 'Vault name'
        default_value = 'RATED_POOLS_YT_TOKEN'
        group = _token

    # Every parameter declared above must be listed here, otherwise it is not
    # part of the task's input parameters and only its default is ever used.
    params = (
        OutputTable,
        IdxOpsExecutable,
        SearchDatabaseResource,
        Models,
        Requests,
        Ratings,
        RankingUrlsMap,
        BasePreprocessor,
        TierParameter,
        IndexAnnBetaMode,
        TextMachineLimitsMonitor,
        CollectDocTexts,
        CollectStreamHits,
        AdditionalRunParams,
        UseUrlsFromIndex,
        VaultOwner,
        VaultName,
    )


# Short alias for the parameter namespace, used by the task class below.
_P = MakePoolParams


class MakeIdxOpsPool(bugbanner.BugBannerTask):
    """
        Runs idx_ops in ``estfeatures2f`` mode on the given search shard, requests and ratings.
        Cooks a protopool.
    """

    type = 'MAKE_IDX_OPS_POOL'

    client_tags = ctc.Tag.Group.LINUX
    execution_space = 10000  # 10 Gb should be enough
    required_ram = 96 << 10

    input_parameters = _P.params

    def on_enqueue(self):
        """
            Create the factor names / factor borders output resources
            and remember their ids in the task context.
        """
        # Call the direct base class rather than SandboxTask so that an
        # on_enqueue defined by BugBannerTask (if any) is not skipped.
        bugbanner.BugBannerTask.on_enqueue(self)
        # factor names
        resource_names = self.create_resource(
            self.descr + ': factor names', 'factor_names.txt', resource_types.FACTOR_NAMES_TXT
        )
        self.ctx[OUT_FACTOR_NAMES] = resource_names.id
        # factor borders
        resource_borders = self.create_resource(
            self.descr + ': factor borders', 'factor_borders.txt', resource_types.FACTOR_BORDERS_TXT
        )
        self.ctx[OUT_FACTOR_BORDERS] = resource_borders.id

    def on_execute(self):
        """
            Sync the input resources, optionally preprocess the shard,
            dump factor names/borders and run idx_ops to build the pool.
        """
        self.add_bugbanner(bugbanner.Banners.WebBaseSearch)

        idx_ops_path = self.sync_resource(self.ctx[_P.IdxOpsExecutable.name])
        ratings_path = self.sync_resource(self.ctx[_P.Ratings.name])
        requests_path = self.sync_resource(self.ctx[_P.Requests.name])
        shard_path = self.sync_resource(self.ctx[_P.SearchDatabaseResource.name])

        # The preprocessor is optional; when given, idx_ops runs on the preprocessed copy.
        if self.ctx.get(_P.BasePreprocessor.name):
            preprocessor_path = self.sync_resource(self.ctx[_P.BasePreprocessor.name])
            shard_path = self._preprocess_base(shard_path, preprocessor_path)

        input_files = {
            'idx_ops': idx_ops_path,
            'ratings': ratings_path,
            'requests': requests_path,
            'shard': shard_path,
        }

        self._save_factors_info(input_files['idx_ops'])

        # pool = channel.sandbox.get_resource(self.ctx[OUT_POOL])
        self._run_idx_ops(
            input_files=input_files,
            result_name=self.ctx[_P.OutputTable.name],
            use_final_format=True,
            thread_count=24,
            trash_pool=False,  # FIXME(mvel)?
            collect_doc_texts=utils.get_or_default(self.ctx, _P.CollectDocTexts),
            collect_stream_hits=utils.get_or_default(self.ctx, _P.CollectStreamHits),
        )

    def _get_yt_token(self):
        """
            Read the YT token from the vault selected by the VaultOwner/VaultName parameters.
        """
        return self.get_vault_data(
            utils.get_or_default(self.ctx, _P.VaultOwner),
            utils.get_or_default(self.ctx, _P.VaultName)
        )

    def _preprocess_base(self, shard_path, preprocessor_path):
        """
            Copy the shard into a task-local path and run the preprocessor on that copy.

            :param shard_path: path of the synced search database resource
            :param preprocessor_path: path of the executable; it is invoked with
                the copy's path as its only argument and ``YT_TOKEN`` in its environment
            :return: path of the preprocessed copy
        """
        preprocessed_base = self.path(PREPROCESSED_BASE)
        # NOTE(review): with copy_function=os.symlink the copy presumably consists of
        # symlinks to the original shard files -- confirm against paths.copy_path.
        paths.copy_path(shard_path, preprocessed_base, copy_function=os.symlink)
        access = os.stat(preprocessed_base).st_mode
        # Give the owner full access for the duration of the preprocessing only.
        os.chmod(preprocessed_base, access | 0o700)
        try:
            process.run_process(
                [
                    preprocessor_path,
                    preprocessed_base
                ],
                log_prefix='preprocess_base',
                environment={
                    'YT_TOKEN': self._get_yt_token(),
                }
            )
        finally:
            # Restore the original mode even when the preprocessor (or the vault lookup) fails.
            os.chmod(preprocessed_base, access)
        return preprocessed_base

    def _save_factors_info(self, idx_ops_path):
        """
            Fill the factor names / factor borders resources created in on_enqueue.

            Runs ``idx_ops factor_names`` and ``idx_ops factor_borders`` and redirects
            their stdout into the corresponding resource paths.

            :param idx_ops_path: path of the idx_ops executable
        """
        resource_names = channel.sandbox.get_resource(self.ctx[OUT_FACTOR_NAMES])
        resource_borders = channel.sandbox.get_resource(self.ctx[OUT_FACTOR_BORDERS])

        args_names = [idx_ops_path, 'factor_names']
        args_borders = [idx_ops_path, 'factor_borders']

        if utils.get_or_default(self.ctx, _P.IndexAnnBetaMode):
            args_names.append('--indexann-beta')
            args_borders.append('--indexann-beta')

        with open(resource_names.path, "w") as out_file:
            process.run_process(
                args_names,
                log_prefix='get_factor_names',
                check=True,
                stdout=out_file,
            )

        with open(resource_borders.path, "w") as out_file:
            process.run_process(
                args_borders,
                log_prefix='get_factor_borders',
                check=True,
                stdout=out_file,
            )

    @staticmethod
    def _get_options(input_files):
        """
            Do some feature detection, because different branches
            may need different options.

            Runs ``idx_ops estfeatures2f --help`` with stderr redirected to
            ``options.txt`` (relative to the current directory) and returns
            the contents of that file. The exit code is not checked.

            :param input_files: dict with the idx_ops executable path under ``'idx_ops'``
        """
        with open('options.txt', 'w') as options_file:
            process.run_process(
                [
                    input_files['idx_ops'],
                    'estfeatures2f',
                    '--help',
                ],
                stderr=options_file,
                outputs_to_one_file=False,
                log_prefix='run_idx_ops-get-version',
                check=False)

        return fu.read_file('options.txt')

    def _make_def_args(self, input_files, thread_count):
        """
            Build the part of the idx_ops command line that is driven by task parameters.

            :param input_files: dict with the idx_ops executable path under ``'idx_ops'``
            :param thread_count: value passed to ``--threads``
            :return: list of command-line arguments, starting with the executable path
        """
        args = [
            input_files['idx_ops'],
            'estfeatures2f',
            '--str-sort',
            '--threads', str(thread_count),
            '-g', 'domain',
            '--tier', utils.get_or_default(self.ctx, _P.TierParameter),
        ]
        if utils.get_or_default(self.ctx, _P.UseUrlsFromIndex):
            args.append('--urls-from-index')
        if utils.get_or_default(self.ctx, _P.IndexAnnBetaMode):
            args.append('--indexann-beta')
        if utils.get_or_default(self.ctx, _P.TextMachineLimitsMonitor):
            args.append('--cgi-params')
            args.append('&pron=qbundlelimitsmonitor')
        if utils.get_or_default(self.ctx, _P.Models):
            args += ['--models', self.sync_resource(self.ctx[_P.Models.name])]
        if utils.get_or_default(self.ctx, _P.RankingUrlsMap):
            args += ['--ranking-urls-map', self.sync_resource(self.ctx[_P.RankingUrlsMap.name])]

        additional_params = utils.get_or_default(self.ctx, _P.AdditionalRunParams)
        if additional_params:
            # Split on any whitespace run: repeated, leading or trailing spaces
            # must not turn into empty-string arguments.
            # This is still not safe for quoted values.
            args += additional_params.split()

        return args

    def _run_idx_ops(
        self,
        input_files,
        result_name,
        thread_count=24,
        trash_pool=False,
        collect_doc_texts=False,
        collect_stream_hits=False,
        use_final_format=False,
    ):
        """
            Executes idx_ops

            :param input_files: dict with paths under the keys
                ``'idx_ops'``, ``'ratings'``, ``'requests'`` and ``'shard'``
            :param result_name: value passed to ``--output``
            :param thread_count: value passed to ``--threads``
            :param trash_pool: add the ``--trash-rating`` / ``--trash-ratio`` options
            :param collect_doc_texts: add ``--collect-doc-texts``
            :param collect_stream_hits: add ``--collect-stream-hits``
            :param use_final_format: currently ignored, only a warning is logged
        """
        # The detected options are only logged for now.
        options_list = self._get_options(input_files)
        logging.debug("idx_ops options detected:\n%s\n", options_list)

        # MR/YT output is internally supported
        output_opts = ['--output', result_name]

        args = self._make_def_args(input_files, thread_count)

        # res_type = resource_types.IDX_OPS_EST_FEATURES_PROTOBUF_OUTPUT
        if use_final_format:
            logging.warning("use-final-format option is ignored by now.")
            # args.append('--sorted-final-format' if '--sorted-final-format' in options_list else '--final-format')
            # res_type = resource_types.IDX_OPS_EST_FEATURES_PROTOBUF_FINAL_OUTPUT

        if trash_pool:
            # FIXME(mvel): do we need it? surely not for generic estimated pools
            args += [
                '--trash-rating', '-0.5',
                '--trash-ratio', '0.3',
            ]

        if collect_doc_texts:
            args.append('--collect-doc-texts')

        if collect_stream_hits:
            args.append('--collect-stream-hits')

        args += output_opts
        args += [
            input_files['ratings'],
            input_files['requests'],
            input_files['shard'],
        ]

        idx_ops_env = {
            'MR_RUNTIME': 'YT',
            'YT_DISABLE_CLIENT_SUB_TRANSACTIONS': '1',
            'YT_CONFIG_PATCHES': '{yamr_mode={create_tables_outside_of_transaction=%true}}',
            'YT_TABLE_WRITER': '{"max_row_weight":67108864}',
            'WRITE_TO_LOG': '1',
        }
        # Log the environment before the token is added so the secret never reaches the log.
        logging.debug("idx_ops process environment (without YT_TOKEN):\n%s\n", json.dumps(idx_ops_env, indent=4))
        idx_ops_env['YT_TOKEN'] = self._get_yt_token()
        process.run_process(
            args,
            log_prefix='run_idx_ops',
            environment=idx_ops_env,
            exc_class=errors.SubprocessErrorBase,  # because YT errors are temporary
        )


# Module-level binding of the task class.
# NOTE(review): presumably the name the Sandbox loader looks up -- confirm.
__Task__ = MakeIdxOpsPool
