"""Rules that make decision based on several checks results."""

import logging

from sepelib.core.exceptions import Error
from walle.expert.constants import HW_WATCHER_CHECK_MAX_POSSIBLE_DELAY
from walle.expert.decision import Decision
from walle.expert.rules import AvailabilityCheckRule
from walle.expert.rules.base import AbstractRule
from walle.expert.rules.escalation import (
    EscalationRules,
    EscalationPoint,
    action_match,
    limit_reached,
    escalate_to_deactivate,
    task_has_not_helped,
    escalate_to_redeploy,
)
from walle.expert.rules.utils import get_check_result
from walle.expert.types import WalleAction, Failure, CheckType, CheckStatus
from walle.models import timestamp
from walle.operations_log.constants import Operation

log = logging.getLogger(__name__)
decision_debug_log = logging.getLogger("decision_debug_log")


class CanNotHealHost(Error):
    """Base error for conditions under which the missing-hw-checks rule must not act on a host.

    ``MissingHwChecksRule.make_decision`` catches this exception (and its subclasses)
    and converts the exception text into a "wait" decision.
    """


class WalleMetaIsBroken(CanNotHealHost):
    """The walle_meta check result is absent, in an unsupported status, or too fresh to rely on."""


class HostIsNotAvailable(CanNotHealHost):
    """The availability rule did not report the host as healthy."""


class PossibleFlap(CanNotHealHost):
    """An hw-watcher check has not been missing/staled long enough to rule out a flap."""


class MissingHwChecksRule(AbstractRule):
    """Rule that reacts to missing hw-watcher checks and to mostly-invalid hardware checks.

    The rule proposes a reboot in two situations:

    * every hw-watcher check is missing or staled, walle_meta looks sane, the host is
      available and the checks have been gone long enough not to be a flap;
    * all hw-watcher checks are enabled and at least ``_INVALID_PERCENTAGE_TO_FAIL``
      percent of the enabled hardware checks are invalid, while the host is available.

    When a precondition for healing fails, the rule returns a "wait" decision that
    carries the explanation. Otherwise the host is reported healthy.
    """

    # Statuses that count as "the check result is not there".
    _ALL_MISSING_STATUSES = frozenset([CheckStatus.MISSING, CheckStatus.STALED])
    # Statuses that count as "the check result is there but unusable".
    _ALL_INVALID_STATUSES = frozenset([CheckStatus.INVALID])

    _ALL_HW_WATCHER_CHECKS = frozenset(set(CheckType.ALL_HW_WATCHER))

    # All hardware checks except rack overheat, which is excluded from the invalid-percentage computation.
    _ALL_HARDWARE_CHECKS = frozenset(set(CheckType.ALL_HARDWARE) - {CheckType.WALLE_RACK_OVERHEAT})
    # Integer percentage of enabled hardware checks that must be invalid to trigger a reboot.
    _INVALID_PERCENTAGE_TO_FAIL = 60

    # Escalation chain applied by `escalate`. `make_decision` itself only ever proposes REBOOT.
    escalation_rules = EscalationRules(
        EscalationPoint(
            predicate=action_match(WalleAction.REBOOT),
            reason=task_has_not_helped(Operation.REBOOT.host_status, "Reboot hasn't helped", "Host failed to reboot"),
            action=escalate_to_redeploy,
        ),
        EscalationPoint(
            predicate=action_match(WalleAction.REBOOT),
            reason=task_has_not_helped(
                Operation.REDEPLOY.host_status, "Redeploying hasn't helped", "Host failed to redeploy"
            ),
            action=escalate_to_deactivate,
        ),
        EscalationPoint(
            predicate=action_match(WalleAction.REBOOT),
            reason=limit_reached("max_host_reboots", Operation.REBOOT),
            action=escalate_to_redeploy,
        ),
        EscalationPoint(
            predicate=action_match(WalleAction.REDEPLOY),
            reason=limit_reached("max_host_redeployments", Operation.REDEPLOY),
            action=escalate_to_deactivate,
        ),
    )

    def make_decision(self, host, reasons, enabled_checks):
        """Produce a decision for the host based on its hw-watcher and hardware check results.

        :param host: host object; passed through to ``AvailabilityCheckRule.make_decision``.
        :param reasons: mapping of check type to check result, read via ``get_check_result``.
        :param enabled_checks: set-like collection of check types enabled for the host
            (it is intersected with frozensets, so it must support ``&``).
        :return: a reboot decision, a wait decision, or a healthy decision.
        """
        all_hw_watcher_checks = self._ALL_HW_WATCHER_CHECKS
        enabled_hardware_checks = enabled_checks & self._ALL_HARDWARE_CHECKS

        if all(self._is_missing(reasons, check) for check in all_hw_watcher_checks):
            try:
                self._assert_meta_is_not_broken(reasons)
                self._assert_host_is_available(host, reasons, enabled_checks)
                self._assert_is_not_a_flap(reasons)

                return self._decision_reboot("All hw-watcher checks are missing.", CheckType.ALL_HW_WATCHER)

            except CanNotHealHost as e:
                return self._decision_wait(str(e))

        if (enabled_checks & all_hw_watcher_checks) != all_hw_watcher_checks:
            reason = "Missing hw-checks rule is not enabled for the host: not all hw-watcher checks are enabled."
            return Decision.healthy(reason)

        invalid_checks = {check for check in enabled_hardware_checks if self._is_invalid(reasons, check)}

        # Guard against division by zero: nothing visible here guarantees that at least one
        # hardware check is enabled at this point. With no enabled hardware checks there is
        # nothing that can be invalid, so fall through to the healthy decision.
        if (
            enabled_hardware_checks
            and len(invalid_checks) * 100 // len(enabled_hardware_checks) >= self._INVALID_PERCENTAGE_TO_FAIL
        ):
            try:
                self._assert_host_is_available(host, reasons, enabled_checks)

                # TODO(rocco66): use enabled_hardware_checks here instead of CheckType.ALL_HARDWARE?
                return self._decision_reboot("Most of hardware checks are invalid.", CheckType.ALL_HARDWARE)
            except CanNotHealHost as e:
                # This branch is only reachable when NOT all hw-watcher checks are missing,
                # so report the condition that actually triggered the rule.
                return self._decision_wait(str(e), problem="Most of hardware checks are invalid")

        return Decision.healthy("Host is healthy.")

    def escalate(self, host, decision):
        """Run the decision through this rule's escalation chain and return the result."""
        return self.escalation_rules.escalate(host, decision)

    @staticmethod
    def _decision_reboot(reason, checks):
        """Build a REBOOT decision with the CHECKS_MISSING failure for the given checks."""
        return Decision(
            WalleAction.REBOOT,
            checks=checks,
            failures=[Failure.CHECKS_MISSING],
            reason=reason,
        )

    @staticmethod
    def _decision_wait(reason, problem="All hw-watcher checks are missing"):
        """Build a wait decision explaining why the detected problem is not being healed.

        :param reason: explanation of what prevents healing (text of a ``CanNotHealHost``).
        :param problem: short description of the condition that triggered the rule; the
            default keeps the message identical for callers that pass only ``reason``.
        """
        return Decision.wait("{}, but it's probably some other failure: {}".format(problem, reason))

    @staticmethod
    def _is_missing(reasons, check, missing_statuses=_ALL_MISSING_STATUSES):
        """Return True if the check's status is one of ``missing_statuses``."""
        return get_check_result(reasons, check)["status"] in missing_statuses

    @staticmethod
    def _is_invalid(reasons, check, invalid_statuses=_ALL_INVALID_STATUSES):
        """Return True if the check's status is one of ``invalid_statuses``."""
        return get_check_result(reasons, check)["status"] in invalid_statuses

    @staticmethod
    def _assert_meta_is_not_broken(reasons):
        """Raise ``WalleMetaIsBroken`` unless the walle_meta result is present, supported and settled.

        NOTE(review): exceptions here are raised with extra positional arguments; this
        presumably relies on the ``Error`` base class formatting its message with them — confirm.
        """
        if CheckType.W_META not in reasons:
            # do not try to fix these.
            raise WalleMetaIsBroken("no result for walle_meta check.")

        meta_check = get_check_result(reasons, CheckType.W_META)
        if meta_check["status"] not in {CheckStatus.PASSED, CheckStatus.FAILED}:
            # walle_meta is either missing or suspected, unsupported configuration.
            # failed walle_meta is supported configuration (currently, it means that hw-watcher is broken).
            raise WalleMetaIsBroken("walle_meta is {}.", meta_check["status"])

        # A walle_meta result that changed within the last HW_WATCHER_CHECK_MAX_POSSIBLE_DELAY
        # is considered too fresh. A missing or falsy status_mtime skips this check.
        status_mtime = meta_check.get("status_mtime")
        if status_mtime and status_mtime + HW_WATCHER_CHECK_MAX_POSSIBLE_DELAY >= timestamp():
            raise WalleMetaIsBroken("walle_meta result is too fresh, wait hw-checks to catch up.")

    @staticmethod
    def _assert_host_is_available(host, reasons, enabled_checks):
        """Raise ``HostIsNotAvailable`` unless ``AvailabilityCheckRule`` reports the host healthy."""
        availability_decision = AvailabilityCheckRule.make_decision(host, reasons, enabled_checks)

        if availability_decision.action != WalleAction.HEALTHY:
            raise HostIsNotAvailable(availability_decision.reason)

    @classmethod
    def _assert_is_not_a_flap(cls, reasons):
        """Raise ``PossibleFlap`` if any hw-watcher check went missing/staled too recently.

        A check is considered settled when its relevant timestamp is not newer than
        ``timestamp() - 2 * HW_WATCHER_CHECK_MAX_POSSIBLE_DELAY``.
        """
        minimum_stale_time = timestamp() - 2 * HW_WATCHER_CHECK_MAX_POSSIBLE_DELAY

        for check in cls._ALL_HW_WATCHER_CHECKS:
            check_result = get_check_result(reasons, check)

            # Some hw-watcher checks may not turn into NO DATA when hw-watcher breaks,
            # because we can still get metadata from hw-watcher and send it to juggler.
            # That metadata will be stale, and its timestamp is what is actually used for stale_timestamp.
            # For non-hw-watcher checks stale_timestamp is bumped on check arrival, and STALED status means
            # that we did not receive fresh data from juggler (juggler is causing a delay, not the host).
            # So stale_timestamp is only suitable for hw-watcher checks, and the whole rule
            # should only consider STALED status for hw-watcher checks.
            if check_result["status"] == CheckStatus.STALED:
                if check_result["stale_timestamp"] > minimum_stale_time:
                    raise PossibleFlap("check {} has not been staled for long enough, need to wait more.", check)
            else:
                if check_result["status_mtime"] > minimum_stale_time:
                    raise PossibleFlap("check {} has not been missing for long enough, need to wait more.", check)
