"""Contains all logic for processing host deployment using LUI provisioner."""

import json
import logging
from copy import deepcopy

import walle.admin_requests.request as admin_requests
from sepelib.core import config, constants
from sepelib.core.exceptions import Error
from walle import audit_log
from walle.clients import deploy as deploy_client, bot
from walle.clients.deploy import DEPLOY_CONFIG_EXTERNAL
from walle.constants import PROVISIONER_LUI
from walle.fsm_stages import common
from walle.fsm_stages.common import (
    register_stage,
    complete_current_stage,
    fail_current_stage,
    commit_stage_changes,
    retry_parent_stage,
    get_parent_stage,
    get_current_stage,
    terminate_current_stage,
    retry_current_stage,
    get_stage_deploy_configuration,
)
from walle.hosts import TaskType
from walle.models import timestamp
from walle.stages import Stages, StageTerminals
from walle.stats import stats_manager
from walle.util.deploy_config import DeployConfigPolicies
from walle.util.misc import drop_none

log = logging.getLogger(__name__)

# NOTE(review): timings below are built from `constants.*_SECONDS` values or passed as `check_after`,
# so they are presumably all expressed in seconds.

# Passed as `check_after` by the status handlers while the installation is still in progress.
_INSTALLATION_CHECK_PERIOD = 15
"""OS installation process check period."""

# Passed as `check_after` by ConfigInappropriateStatusHandler.
# NOTE(review): the description below talks about LUI's own daily reload, not about this 15 minute period.
_CONFIG_INAPPROPRIATE_CHECK_PERIOD = 15 * constants.MINUTE_SECONDS
"""LUI tries to reload inappropriate config every day"""

# Checked by BootingStatusHandler; on timeout PxeBootFailed is raised.
_PXE_BOOT_TIMEOUT = 15 * constants.MINUTE_SECONDS
"""Time which is required for host to boot from PXE."""

# Checked by PendingStatusHandler; on timeout PxeBootFailed is raised.
_PENDING_STATUS_TIMEOUT = 15 * constants.MINUTE_SECONDS
"""Timeout for host waiting for deploy to begin."""

# NOTE(review): not referenced anywhere in the visible part of this module.
_HOST_INFO_UPDATE_TIMEOUT = constants.HOUR_SECONDS
"""LUI host status timeout.

If it doesn't update during the installation (READY_TO_INSTALL -> INSTALLING -> INSTALLATION_FAILED -> READY_TO_INSTALL
-> ..) it could mean that the host has hang or some other error has occurred."""

# Used for deploy_client.STATUS_DEPLOYING in ProcessingStatusHandler and by ConfigInappropriateStatusHandler.
_INSTALLATION_TIMEOUT = constants.DAY_SECONDS
"""Timeout for Task.STATUS_INSTALLING."""

# Used for deploy_client.STATUS_FAILED in ProcessingStatusHandler.
_FAILING_TIMEOUT = constants.HOUR_SECONDS
"""Timeout for Task.STATUS_FAILING. We expect Setup to restart deploying automatically, but it can fail."""

# Used for deploy_client.STATUS_DISK_FAILED in ProcessingStatusHandler.
_FAILING_TIMEOUT_FOR_FAILED_DISK = constants.MINUTE_SECONDS
"""Timeout for failed disk status during deploy."""

# Used for deploy_client.STATUS_PREPARING in ProcessingStatusHandler.
_PREPARING_TIMEOUT = 30 * constants.MINUTE_SECONDS
"""Timeout for Task.STATUS_PREPARING. We expect Setup to start deploying before this timeout expires."""

# Overall deadline: once the stage is older than this, error handlers fail the stage instead of retrying.
_INSTALLATION_RETRY_TIMEOUT = constants.WEEK_SECONDS
"""Give the men some time to fix any problems we've met on our way."""

# Passed as `check_after` by InstallationStageHandler._handle_critical_error.
# NOTE(review): the description below is a verbatim copy of the one for _ERROR_RETRY_PERIOD.
_ERROR_CHECK_INTERVAL = 5 * constants.MINUTE_SECONDS
"""Retry critical with increased interval because they need a human assistance which is slow."""

# Passed as `check_after` when the current/parent stage is retried and on the DEPLOY_FAILED terminal.
_ERROR_RETRY_PERIOD = constants.HOUR_SECONDS
"""Retry critical with increased interval because they need a human assistance which is slow."""

# Stage status set by InstallationStageHandler._fail_with_bmc_reset and checked at the start of its handle().
_STATUS_RESETTING_BMC = "resetting_bmc"
"""Resetting BMC before retrying the deploy."""


# Counter incremented by InstallationStageHandler._on_deploy_failed.
LUI_DEPLOY_FAIL_COUNTER_NAME = "lui_deploy_fail"


# NOTE(rocco66): stats init
stats_manager.set_counter_value(LUI_DEPLOY_FAIL_COUNTER_NAME, 0)


class CriticalStageError(Error):
    """Stage error that Wall-E developers might be interested in.

    It is not necessarily fatal for the stage.
    """


class FatalStageError(CriticalStageError):
    """Stage error that is fatal for the stage.

    Wall-E developers should not need to look into these errors: they are all well known.
    """


class DeployFailed(FatalStageError):
    """Raised when the LUI fail count accumulated since the deploy was scheduled reaches `deployment.max_failures`."""


class PxeBootFailed(FatalStageError):
    """Raised when the host stays in the pending or booting status longer than the corresponding timeout."""


class HostProvisionerChanged(FatalStageError):
    """Raised when the deploy provisioner seen by a LUI stage is not LUI."""

    def __init__(self, provisioner):
        """Build the error; `provisioner` is the non-LUI provisioner that was found."""
        template = "Redeploy task failed. Host deploy provisioner has changed from {} to {} during the task."
        super().__init__(template, PROVISIONER_LUI, provisioner)


class DeployStageHandler:
    """Base class of LUI deploy stage handlers bound to one host.

    The constructor resolves the host's current stage and the parent of that stage; subclasses implement `handle`.
    """

    def __init__(self, host):
        current_stage = get_current_stage(host)

        self.host = host
        self.stage = current_stage
        self.parent_stage = get_parent_stage(host, current_stage)

    @classmethod
    def as_handler(cls):
        """Return a callable taking a host that builds the handler for it and runs `handle`."""

        def run(host):
            return cls(host).handle()

        return run

    def handle(self):
        """Process the stage. Must be implemented by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _assert_provisioner_is_lui(provisioner):
        """Raise HostProvisionerChanged unless `provisioner` is LUI."""
        if provisioner != PROVISIONER_LUI:
            raise HostProvisionerChanged(provisioner)

    def _on_provisioner_changed(self, error):
        """Fail the current stage with `error` extended by a hint that depends on the task type."""
        is_automated = self.host.task.type == TaskType.AUTOMATED_HEALING
        hint = " Failing automated {}." if is_automated else " Please restart {} if you need."

        return fail_current_stage(self.host, error + hint.format(self.host.status))

    def _refresh_deploy_config(self, current_configuration):
        """Re-read the host deploy configuration and store it in the parent stage if it differs.

        When the parent stage has the `config_forced` param set, the config name of `current_configuration` is kept.
        Raises HostProvisionerChanged if the host provisioner is not LUI.
        Returns the resulting configuration.
        """
        fresh_configuration = self.host.get_deploy_configuration()
        self._assert_provisioner_is_lui(fresh_configuration.provisioner)

        if self.parent_stage.get_param("config_forced", False):
            fresh_configuration = fresh_configuration._replace(config=current_configuration.config)

        if fresh_configuration != current_configuration:
            self.parent_stage.set_data("config_override", fresh_configuration)

        return fresh_configuration


class GenerateDeployConfigContentHandler(DeployStageHandler):
    """Stage handler that generates deploy policy overrides and stores them in the parent stage."""

    def handle(self):
        """Generate the overrides with the configured policy, store them and complete the stage."""
        parent_stage = self.parent_stage
        configuration = get_stage_deploy_configuration(parent_stage)

        policy = DeployConfigPolicies.get_policy_class(configuration.deploy_config_policy)()
        overrides = policy.generate(host=self.host, deploy_config_name=configuration.config)
        parent_stage.set_data("deploy_policy_overrides", overrides)

        # Only non-empty overrides are reflected in the audit log.
        if overrides:
            audit_log.update_payload(self.host.task.audit_log_id, {"deploy_policy_overrides": overrides})

        complete_current_stage(self.host)


class AssignConfigHandler(DeployStageHandler):
    """Stage handler that schedules a host redeploy in LUI with the configuration taken from the parent stage."""

    def handle(self):
        """Run the stage and translate stage errors into fail/retry decisions.

        * HostProvisionerChanged fails the current stage;
        * any other FatalStageError re-reads the host deploy configuration and retries the current or parent stage;
        * CriticalStageError is logged at ERROR level and the current stage is retried.
        """
        configuration = get_stage_deploy_configuration(self.parent_stage)
        # Copy the stored overrides because `_handle_stage` pops keys from the dict.
        # `or {}` keeps the handler from failing with a TypeError when the stored value is None.
        deploy_policy_overrides = dict(self.parent_stage.get_data("deploy_policy_overrides", None) or {})

        try:
            self._handle_stage(configuration, deploy_policy_overrides)
        except HostProvisionerChanged as e:
            # Must go first: HostProvisionerChanged is a FatalStageError too.
            self._on_provisioner_changed(str(e))

        except FatalStageError as e:
            self._refresh_config_and_retry(str(e), configuration)

        except CriticalStageError as e:
            self._retry_with_deadline(str(e), logging.ERROR)

    def _handle_stage(self, configuration, deploy_policy_overrides):
        """Validate preconditions, build deploy parameters, schedule the redeploy and complete the stage.

        Note: `deploy_policy_overrides` is modified (the `config_content_json` key is popped).
        """
        self._assert_host_has_mac_address()
        self._assert_provisioner_is_lui(configuration.provisioner)
        self._assert_deploy_config_exists(configuration.config, self.host.get_eine_box())

        deploy_params = {"config_name": configuration.config}  # deploy policy can override it

        if deploy_policy_overrides:
            config_content_json = deploy_policy_overrides.pop("config_content_json", None)
            if config_content_json is not None:
                deploy_params["config_content"] = json.loads(config_content_json)

            deploy_params.update(deploy_policy_overrides)

        deploy_params.update(self._get_hbf_project_params())
        deploy_params.update(self._get_certificate_params())

        self._reset_fail_count()

        self._schedule_redeploy(deploy_params)

        self._store_deploy_macs()
        complete_current_stage(self.host)

    def _assert_host_has_mac_address(self):
        """Raise CriticalStageError if the host has no MAC addresses."""
        if not self.host.macs:
            raise CriticalStageError("The host doesn't have any MAC addresses registered in BOT.")

    @classmethod
    def _assert_deploy_config_exists(cls, config_name, project_box):
        """Raise FatalStageError if the deploy provider of `project_box` does not list `config_name`."""
        if not cls._deploy_config_exists(config_name, project_box):
            raise FatalStageError("Deploy config {} does not exist in Setup/LUI.", config_name)

    @staticmethod
    def _deploy_config_exists(config_name, project_box):
        """Return whether `config_name` is among the deploy configs of the provider selected by `project_box`.

        DeployPersistentError is converted to CriticalStageError.
        """
        try:
            return config_name in deploy_client.get_deploy_configs(deploy_client.get_deploy_provider(project_box))
        except deploy_client.DeployPersistentError as e:
            raise CriticalStageError(str(e))

    def _schedule_redeploy(self, deploy_kwargs):
        """Schedule the redeploy of the host in LUI.

        `deploy_kwargs` must contain `config_name`; the remaining items are passed to the client as keyword
        arguments. The caller's dict is not modified. DeployPersistentError is converted to CriticalStageError.
        """
        # Work on a shallow copy so that popping `config_name` doesn't leak to the caller.
        deploy_kwargs = dict(deploy_kwargs)
        config_name = deploy_kwargs.pop("config_name")

        if "private_data" in deploy_kwargs:
            # Never log private content (e.g. the host certificate): log a copy with the content masked.
            masked_private_data = [dict(item, content="<hidden>") for item in deploy_kwargs["private_data"]]
            deploy_kwargs_for_log = dict(deploy_kwargs, private_data=masked_private_data)
        else:
            deploy_kwargs_for_log = deploy_kwargs

        log.debug(
            "%s: scheduling host redeploy with config %s: %s",
            self.host.human_id(),
            config_name,
            json.dumps(deploy_kwargs_for_log),
        )

        try:
            client = deploy_client.get_client(deploy_client.get_deploy_provider(self.host.get_eine_box()))
            client.schedule_redeploy(self.host.name, self.host.macs, config_name=config_name, **deploy_kwargs)
        except deploy_client.DeployPersistentError as e:
            raise CriticalStageError(str(e))

    def _get_fail_count(self):
        """Return the current `fail_count` reported by LUI for the host, or 0 if LUI doesn't know the host."""
        try:
            client = deploy_client.get_client(deploy_client.get_deploy_provider(self.host.get_eine_box()))
            info = client.get_deploy_status(self.host.name)
        except deploy_client.HostDoesntExistError:
            return 0
        else:
            log.debug("%s: got server info from setup: %s", self.host.human_id(), json.dumps(info))
            return info["fail_count"]

    def _reset_fail_count(self):
        """Remember the current LUI fail count in the parent stage as the baseline for this deploy."""
        fail_count = self._get_fail_count()
        self.parent_stage.set_temp_data("start_fail_count", fail_count)

    def _store_deploy_macs(self):
        """Remember the MAC addresses the deploy was scheduled with in the parent stage."""
        self.parent_stage.set_temp_data("deploy_macs", self.host.macs)

    def _get_hbf_project_params(self):
        """Return the `project_id` deploy parameter (hex without the `0x` prefix), or {} if the project has none."""
        project = self.host.get_project()
        if project.hbf_project_id is None:
            return {}

        return {"project_id": hex(project.hbf_project_id)[2:]}

    def _get_certificate_params(self):
        """Return the `private_data` deploy parameter built from the stored certificate, or {} if there is none."""
        certificate = self.parent_stage.get_data("certificate", None)
        if certificate is None:
            return {}

        # `path` and `owner` are optional settings; unset ones are dropped from the descriptor.
        certificate_descriptor = drop_none(
            {
                "content": certificate,
                "path": config.get_value("certificator.host_certificate_path", None),
                "owner": config.get_value("certificator.host_certificate_owner", None),
            }
        )

        return {"private_data": [certificate_descriptor]}

    def _refresh_config_and_retry(self, error, current_configuration):
        """Re-read the host deploy configuration and retry.

        The current stage is retried when only the config name has changed, the parent stage otherwise.
        If the provisioner is not LUI anymore the stage is failed.
        """
        try:
            new_configuration = self._refresh_deploy_config(current_configuration)
        except HostProvisionerChanged as e:
            return self._on_provisioner_changed(str(e))

        minor_changes = self._is_minor_changes(current_configuration, new_configuration)
        self._retry_with_deadline(error, minor=minor_changes)

    def _retry_with_deadline(self, error, log_level=logging.INFO, minor=True):
        """Check deadline and retry stage.

        Fails the current stage once it is older than `_INSTALLATION_RETRY_TIMEOUT`; otherwise retries the current
        stage (`minor=True`) or the parent stage (`minor=False`).
        """
        # The error text is passed as an argument, not as a part of the format string: it may contain `%`.
        log.log(log_level, "%s: %s", self.host.human_id(), error)

        if self.stage.timed_out(_INSTALLATION_RETRY_TIMEOUT):
            return fail_current_stage(self.host, error)

        if minor:
            retry_current_stage(self.host, error=error, check_after=_ERROR_RETRY_PERIOD)
        else:
            retry_parent_stage(self.host, error=error, check_after=_ERROR_RETRY_PERIOD)

    @staticmethod
    def _is_minor_changes(old_configuration, new_configuration):
        """If configurations differ only in config, then it is a minor change.
        Other differences (network, certificate, etc.) are major changes.
        """

        return new_configuration == old_configuration._replace(config=new_configuration.config)


class InstallationStatusHandler:
    """Registry that dispatches a LUI deploy status to the first registered handler class that matches it."""

    # Registered handler classes, in registration order.
    _handlers = []

    @classmethod
    def register(cls, handler):
        """Register a status handler class. Usable as a class decorator.

        The handler is returned unchanged so that the decorated name stays bound to the class (a decorator
        returning None would rebind it to None).
        """
        cls._handlers.append(handler)
        return handler

    @classmethod
    def handle_status(cls, host, stage, parent_stage, lui_info):
        """Instantiate the first handler matching `lui_info["status"]` and return its `handle_status()` result.

        Raises CriticalStageError if no registered handler matches the status.
        """
        status = lui_info["status"]

        for handler in cls._handlers:
            if handler.match_status(status):
                return handler(host, stage, parent_stage, lui_info).handle_status()

        raise CriticalStageError(
            "Deploying process got an unexpected status '{}' ({} LUI status).", status, lui_info["description"]
        )


class AbstractStatusHandler:
    """Base class of handlers for one LUI deploy status.

    Subclasses set `status` (or override `match_status`) and implement `handle_status`.
    """

    # Status handled by the class; compared by `match_status`.
    status = None

    def __init__(self, host, stage, parent_stage, lui_info):
        self.host, self.stage = host, stage
        self.parent_stage, self.lui_info = parent_stage, lui_info

    @classmethod
    def match_status(cls, status):
        """Return whether this class handles `status`."""
        is_match = cls.status == status
        return is_match

    def handle_status(self):
        """React to the status. Must be implemented by subclasses."""
        raise NotImplementedError

    def _check_deploy_error_count(self):
        """Raise DeployFailed when LUI failures since the stored baseline reach `deployment.max_failures`."""
        reported = self.lui_info["fail_count"]
        baseline = self.parent_stage.get_temp_data("start_fail_count")
        failures = reported - baseline

        if failures >= config.get_value("deployment.max_failures"):
            raise DeployFailed("Setup failed {} times.", failures)


@InstallationStatusHandler.register
class CompletedStatusHandler(AbstractStatusHandler):
    """Handles `STATUS_COMPLETED`: logs the success and completes the current stage."""

    status = deploy_client.STATUS_COMPLETED

    def handle_status(self):
        host = self.host
        log.info("%s has been deployed.", host.human_id())
        return complete_current_stage(host)


@InstallationStatusHandler.register
class RetryStatusHandler(AbstractStatusHandler):
    """Handles `STATUS_RETRY`: terminates the current stage with the RETRY_ACTION terminal."""

    status = deploy_client.STATUS_RETRY

    def handle_status(self):
        host = self.host
        log.info("%s need to repeat deploying process.", host.human_id())
        return terminate_current_stage(StageTerminals.RETRY_ACTION, host=host)


@InstallationStatusHandler.register
class PendingStatusHandler(AbstractStatusHandler):
    """Handles `STATUS_PENDING`: keeps polling until the pending timeout expires."""

    status = deploy_client.STATUS_PENDING

    def handle_status(self):
        """Schedule the next check, or raise PxeBootFailed once `_PENDING_STATUS_TIMEOUT` has passed."""
        self._check_deploy_error_count()

        if not self.stage.timed_out(_PENDING_STATUS_TIMEOUT):
            return commit_stage_changes(self.host, check_after=_INSTALLATION_CHECK_PERIOD)

        raise PxeBootFailed(
            "Host failed to boot from PXE: either it's plugged into an invalid VLAN "
            "or there is a hardware problem."
        )


@InstallationStatusHandler.register
class BootingStatusHandler(AbstractStatusHandler):
    """Handles `STATUS_BOOTING`: keeps polling until the PXE boot timeout expires."""

    status = deploy_client.STATUS_BOOTING

    def handle_status(self):
        """Schedule the next check, or raise PxeBootFailed once `_PXE_BOOT_TIMEOUT` has passed."""
        self._check_deploy_error_count()

        if not self.stage.timed_out(_PXE_BOOT_TIMEOUT):
            return commit_stage_changes(self.host, check_after=_INSTALLATION_CHECK_PERIOD)

        # Example of this error: https://st.yandex-team.ru/NOC-4781
        raise PxeBootFailed("Host failed to boot from PXE.")


@InstallationStatusHandler.register
class ProcessingStatusHandler(AbstractStatusHandler):
    """Handles the statuses LUI reports while the deploy is being processed (preparing/deploying/failed)."""

    @classmethod
    def match_status(cls, status):
        """Return whether `status` is one of the in-progress statuses handled by this class."""
        handled_statuses = {
            deploy_client.STATUS_PREPARING,
            deploy_client.STATUS_DEPLOYING,
            deploy_client.STATUS_FAILED,
            deploy_client.STATUS_DISK_FAILED,
        }
        return status in handled_statuses

    def handle_status(self):
        """Record the installation start if needed, then check the status timeout and schedule the next check.

        Raises FatalStageError when the timeout for the current status (counted from `install_time`) has expired.
        """
        self._check_deploy_error_count()
        self._cancel_ipmi_admin_requests()

        status_timeout = self._timeout()
        if self.stage.timed_out(status_timeout, "install_time"):
            raise FatalStageError("Deploying process has timed out.")

        commit_stage_changes(self.host, check_after=_INSTALLATION_CHECK_PERIOD)

    def _timeout(self):
        """Return the timeout configured for the current LUI status."""
        timeout_by_status = {
            deploy_client.STATUS_DEPLOYING: _INSTALLATION_TIMEOUT,
            deploy_client.STATUS_PREPARING: _PREPARING_TIMEOUT,
            deploy_client.STATUS_FAILED: _FAILING_TIMEOUT,
            deploy_client.STATUS_DISK_FAILED: _FAILING_TIMEOUT_FOR_FAILED_DISK,
        }
        return timeout_by_status[self.lui_info["status"]]

    def _cancel_ipmi_admin_requests(self):
        """On the first call for the stage, store `install_time` and cancel all IPMI admin requests of the host."""
        if self.stage.get_temp_data("install_time", None) is not None:
            return

        log.info("%s: Deployment process has started.", self.host.human_id())
        self.stage.set_temp_data("install_time", timestamp())

        # Since the deployment process has started, there is a high probability that all created admin requests
        # for broken IPMI are no longer needed and it's reasonable to cancel them now to decrease the chance that
        # a DC engineer will try to heal the host during the deployment process. But we should do this only once -
        # when the deployment process has just started, because for example in case of installation timeout we
        # check host's power status and in this case we mustn't cancel the request that we've just created.
        admin_requests.cancel_all_by_host(self.host.inv, self.host.name, types=admin_requests.RequestTypes.ALL_IPMI)


class InstallationStageHandler(DeployStageHandler):
    """Stage handler that polls LUI for the deploy status of the host and reacts to it."""

    def handle(self):
        """Process the stage: reset BMC if the stage asks for it, otherwise check the deploy status.

        FatalStageError (including DeployFailed and PxeBootFailed) goes to `_handle_deploy_error`,
        any other CriticalStageError to `_handle_critical_error`.
        """
        configuration = get_stage_deploy_configuration(self.parent_stage)
        if self.stage.status == _STATUS_RESETTING_BMC:
            return self._reset_bmc_and_retry()

        try:
            self._handle_stage(configuration)
        except FatalStageError as e:
            self._handle_deploy_error(str(e), configuration)

        except CriticalStageError as e:
            self._handle_critical_error(str(e))

    def _handle_stage(self, configuration):
        """Fetch the host info from LUI, sync the stage status with it, validate it and handle the status."""
        info = self._get_host_info()
        log.debug("%s: got server info from setup: %s", self.host.human_id(), json.dumps(info))
        self._amend_host_status(info)

        self._assert_deploy_mac_address(info)
        self._assert_deploy_config(info, configuration.config)

        self._handle_status(info)

    def _handle_status(self, lui_info):
        """Dispatch the LUI status to its handler.

        If the handler fails with FatalStageError and the host turns out to be powered off, the error is
        replaced with a FatalStageError saying so.
        """
        if lui_info["status"] != deploy_client.STATUS_COMPLETED:
            self._check_deploy_error_count(lui_info)

        try:
            return InstallationStatusHandler.handle_status(self.host, self.stage, self.parent_stage, lui_info)
        except FatalStageError:
            if not self.host.get_ipmi_client().is_power_on():
                raise FatalStageError("Host has suddenly powered off.")

            raise

    def _check_deploy_error_count(self, lui_info):
        """Raise DeployFailed when LUI failures since the stored baseline reach `deployment.max_failures`."""
        fail_count = lui_info["fail_count"] - self.parent_stage.get_temp_data("start_fail_count")

        if fail_count >= config.get_value("deployment.max_failures"):
            raise DeployFailed("Setup failed {} times.", fail_count)

    def _get_host_info(self):
        """Return the deploy status info of the host from LUI.

        Raises FatalStageError if LUI doesn't know the host.
        """
        try:
            client = deploy_client.get_client(deploy_client.get_deploy_provider(self.host.get_eine_box()))
            return client.get_deploy_status(self.host.name)
        except deploy_client.HostDoesntExistError:
            raise FatalStageError("The host has vanished from LUI database during deployment process.")

    def _assert_deploy_mac_address(self, lui_info):
        """Raise FatalStageError if the MACs known to LUI differ from the ones the deploy was scheduled with."""
        deploy_macs = self.parent_stage.get_temp_data("deploy_macs")

        if lui_info["macs"] != deploy_macs:
            error_message = "Host has suddenly changed it's deployment MAC address from {} to {}."
            raise FatalStageError(error_message, deploy_macs, lui_info["macs"])

    @staticmethod
    def _assert_deploy_config(lui_info, config_name):
        """Raise FatalStageError if LUI reports a config that is neither `config_name` nor the external one."""
        if lui_info["config"] != config_name and lui_info["config"] != DEPLOY_CONFIG_EXTERNAL:
            error_message = "Host has suddenly changed it's deployment config from {} to {}."
            raise FatalStageError(error_message, config_name, lui_info["config"])

    def _amend_host_status(self, lui_info):
        """Store the lower-cased LUI status as the stage status when it has changed or has been modified since."""
        modify_time = lui_info.get("modify_time") or self.stage.status_time
        if self.stage.status != lui_info["lui_status"].lower() or (
            modify_time is not None and modify_time > self.stage.status_time
        ):
            commit_stage_changes(self.host, status=lui_info["lui_status"].lower())
            # commit stage changes reloads the host, we need to update these data object instances
            # if we want `set_data` and `set_temp_data` to work. `set_data` is called on the parent stage,
            # so both the stage and its parent are re-resolved, the same way the constructor does it.
            self.stage = get_current_stage(self.host)
            self.parent_stage = get_parent_stage(self.host, self.stage)

    def _handle_deploy_error(self, error, current_configuration):
        """Handle a fatal deploy error.

        Re-reads the host deploy configuration first (failing the stage if the provisioner is not LUI anymore),
        then either reports the deploy as failed (error limit exceeded) or retries the stage after a BMC reset.
        """
        try:
            self._refresh_deploy_config(current_configuration)
        except HostProvisionerChanged as e:
            return self._on_provisioner_changed(str(e))

        if self._setup_error_limit_exceeded():
            self._on_deploy_failed(error)
        else:
            self._fail_with_bmc_reset(error)

    def _setup_error_limit_exceeded(self):
        """Increase the `lui_errors` counter and return whether `deployment.max_errors` is exceeded.

        The helper's return value is treated as "limit is not exceeded".
        """
        counter_name = "lui_errors"
        limit_name = "deployment.max_errors"

        not_exceeded = common.increase_configurable_error_count(
            self.host, self.stage, counter_name, limit_name, error=None, fail_stage=False
        )

        return not not_exceeded

    def _setup_retries_limit_exceeded(self):
        """Increase the `lui_retries` counter and return whether `deployment.max_retries` is exceeded.

        The helper's return value is treated as "limit is not exceeded".
        """
        counter_name = "lui_retries"
        limit_name = "deployment.max_retries"
        not_exceeded = common.increase_configurable_error_count(
            self.host, self.stage, counter_name, limit_name, error=None, fail_stage=False
        )

        return not not_exceeded

    def _on_deploy_failed(self, error):
        """Deploy failure means lui/setup failed to deploy host a few times
        or we've tried to boot host a few times and it failed.

        Try to profile the host if it's possible or just keep retrying until somebody fixes the problem.
        We may need to profile host to fix broken iPXE and/or some other probably broken stuff.

        N.B. Upgrade to highload profile may be disabled via task settings. In this case this task will retry until
        time's out, hence the check_after arg for terminate function call.
        """
        stats_manager.increment_counter(LUI_DEPLOY_FAIL_COUNTER_NAME)
        if self.stage.timed_out(_INSTALLATION_RETRY_TIMEOUT):
            return fail_current_stage(self.host, error)

        if self._setup_retries_limit_exceeded():
            # Out of retries: unschedule the deploy in LUI before failing the stage.
            client = deploy_client.get_client(deploy_client.get_deploy_provider(self.host.get_eine_box()))
            client.deactivate(self.host.name)
            return fail_current_stage(self.host, error)

        # this may lead either to retry parent stage or to upgrade task to profile and redeploy
        terminate_current_stage(StageTerminals.DEPLOY_FAILED, self.host, check_after=_ERROR_RETRY_PERIOD, error=error)

    def _fail_with_bmc_reset(self, error):
        """We just retry the stage, but we need to reset bmc in between, because it is failing on retries."""
        commit_stage_changes(self.host, error=error, status=_STATUS_RESETTING_BMC, check_now=True)

    def _handle_critical_error(self, error):
        """This is most probably an error in wall-e itself or in integration between wall-e and setup/lui.
        Wait until this error gets fixed.
        """
        # The error text is passed as an argument, not as a part of the format string: it may contain `%`.
        log.error("%s: %s", self.host.human_id(), error)

        if self.stage.timed_out(_INSTALLATION_RETRY_TIMEOUT):
            return fail_current_stage(self.host, error)

        commit_stage_changes(self.host, error=error, check_after=_ERROR_CHECK_INTERVAL)

    def _reset_bmc_and_retry(self):
        """Reset BMC with IPMI command and retry the stage."""
        ipmi_client = self.host.get_ipmi_client()
        ipmi_client.bmc_reset()

        terminate_current_stage(StageTerminals.RETRY_ACTION, host=self.host)


@InstallationStatusHandler.register
class ConfigInappropriateStatusHandler(AbstractStatusHandler):
    """Handles `STATUS_CONFIG_INAPPROPRIATE`: reports the problem on the stage and keeps polling."""

    status = deploy_client.STATUS_CONFIG_INAPPROPRIATE

    def handle_status(self):
        """Store an explanatory error on the stage and schedule the next check.

        Raises FatalStageError once the stage is older than `_INSTALLATION_TIMEOUT`.
        """
        self._check_deploy_error_count()

        if self.stage.timed_out(_INSTALLATION_TIMEOUT):
            raise FatalStageError("Deploying process has timed out.")

        config_name = self.get_config_name()
        disk_conf = self.get_host_disk_conf()
        template = (
            "LUI encountered wrong deploy config ({}) and will try to reload it every day\n"
            "Usually it means that config doesn't support host's disk configuration ({})"
        )
        return commit_stage_changes(
            self.host, check_after=_CONFIG_INAPPROPRIATE_CHECK_PERIOD, error=template.format(config_name, disk_conf)
        )

    def get_config_name(self):
        """Return the deploy config name, suffixed with the config policy unless it is unset or passthrough."""
        configuration = get_stage_deploy_configuration(self.parent_stage)
        policy = configuration.deploy_config_policy

        if policy in [None, DeployConfigPolicies.PASSTHROUGH]:
            return configuration.config

        return configuration.config + " (config policy: {})".format(policy)

    def get_host_disk_conf(self):
        """Return the disk configuration of the host as reported by BOT."""
        inventory_number = self.host.inv
        return bot.get_host_disk_configuration(inventory_number)


def _deactivate_host_in_lui(host):
    """Deactivate the host in LUI (skipped when the host has no name) and complete the current stage."""
    if host.name is not None:
        provider = deploy_client.get_deploy_provider(host.get_eine_box())
        deploy_client.get_client(provider).deactivate(host.name)

    complete_current_stage(host)


def _add_host_to_lui(host):
    """Add the host to LUI with the deploy config from the stage's `config` param and complete the stage.

    The stage is failed when the host has no name; on DeployPersistentError it is terminated
    with the SKIP terminal.
    """
    stage = get_current_stage(host)
    if not host.name:
        return fail_current_stage(host, "Host name is required. This is a bug in Wall-E.")

    try:
        provider = deploy_client.get_deploy_provider(host.get_eine_box())
        lui_client = deploy_client.get_client(provider)
        config_name = stage.get_param("config")
        lui_client.setup(host.name, host.macs or [], config_name)
    except deploy_client.DeployPersistentError as e:
        return terminate_current_stage(StageTerminals.SKIP, host, "Failed to add host to lui: {}".format(e))

    complete_current_stage(host)


def _remove_host_from_lui(host):
    """Remove the host from LUI (skipped when the host has no name) and complete the current stage."""
    if host.name is not None:
        provider = deploy_client.get_deploy_provider(host.get_eine_box())
        deploy_client.get_client(provider).remove(host.name)

    complete_current_stage(host)


def _cancel_stage(host, stage):
    """Cancellation handler: deactivate the host in LUI if a deploy has been scheduled for it.

    A deploy counts as scheduled when the parent stage has the `deploy_macs` temp data.
    Any failure to deactivate is only logged.
    """
    if not get_parent_stage(host, stage).has_temp_data("deploy_macs"):
        return

    # Unschedule deployment in LUI. The host may continue to install the system if started already,
    # but at least it won't begin installation if it didn't start it yet.
    log.info("%s: Cancelling the installation...", host.human_id())

    try:
        provider = deploy_client.get_deploy_provider(host.get_eine_box())
        deploy_client.get_client(provider).deactivate(host.name)
    except Exception as e:
        log.error("%s: Failed to cancel the installation: %s", host.human_id(), e)


# Attention:
# The handlers of these three stages store data in their parent stage (`set_data` / `set_temp_data`)
# and may retry it (`retry_parent_stage`). All three share `_cancel_stage` as the cancellation handler.
register_stage(Stages.ASSIGN_LUI_CONFIG, AssignConfigHandler.as_handler(), cancellation_handler=_cancel_stage)
register_stage(
    Stages.GENERATE_CUSTOM_DEPLOY_CONFIG,
    GenerateDeployConfigContentHandler.as_handler(),
    cancellation_handler=_cancel_stage,
)
register_stage(Stages.LUI_INSTALL, InstallationStageHandler.as_handler(), cancellation_handler=_cancel_stage)
# Plain function handlers without a cancellation handler.
register_stage(Stages.LUI_SETUP, _add_host_to_lui)
register_stage(Stages.LUI_REMOVE, _remove_host_from_lui)
register_stage(Stages.LUI_DEACTIVATE, _deactivate_host_in_lui)
