import logging

from walle.expert.decision import Decision
from walle.expert.failure_types import FailureType
from walle.expert.rules.hw_watcher_rules.util import get_eine_code
from walle.expert.rules.utils import get_check_result, is_disabled_check
from walle.operations_log.constants import Operation
from .base import AbstractRule, check_common
from .escalation import (
    EscalationRules,
    escalate_to_deactivate,
    EscalationPoint,
    action_match,
    limit_reached,
    task_has_not_helped,
)
from ..types import WalleAction, CheckType, CheckStatus

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)


class RackOverheatRule(AbstractRule):
    """Decision rule driven by the ``WALLE_RACK_OVERHEAT`` check.

    When the rack overheat check is FAILED, the rule either waits (the host's
    own CPU check does not report ``CPU_OVERHEATING``) or requests the
    ``REPAIR_RACK_OVERHEAT`` action. Both escalation points lead to
    ``escalate_to_deactivate``.
    """

    escalation_rules = EscalationRules(
        # NOTE(review): this point keys on Operation.REPAIR_RACK_FAILURE.host_status
        # while the action and the limit below refer to REPAIR_RACK_OVERHEAT.
        # Looks like it may be a copy-paste from RackRule -- confirm it is intentional.
        EscalationPoint(
            predicate=action_match(WalleAction.REPAIR_RACK_OVERHEAT),
            reason=task_has_not_helped(
                Operation.REPAIR_RACK_FAILURE.host_status, "Ticket is closed, rack has not been repaired"
            ),
            action=escalate_to_deactivate,
        ),
        # Escalate once the "max_host_rack_overheat_repairs" limit is reached.
        EscalationPoint(
            predicate=action_match(WalleAction.REPAIR_RACK_OVERHEAT),
            reason=limit_reached("max_host_rack_overheat_repairs", Operation.REPAIR_RACK_OVERHEAT),
            action=escalate_to_deactivate,
        ),
    )

    def make_decision(self, host, reasons, enabled_checks):
        """Produce a decision for ``host`` from the rack overheat check result.

        Order of evaluation:
        1. check disabled for the host -> healthy decision;
        2. check status is not FAILED -> whatever ``check_common`` returns;
        3. host CPU is not reported as overheating -> wait decision;
        4. otherwise -> repair decision.
        """
        overheat_check = CheckType.WALLE_RACK_OVERHEAT

        if is_disabled_check(host, overheat_check, enabled_checks):
            return Decision.healthy("Rack overheat check is not enabled for the host.", checks=[overheat_check])

        rack_result = get_check_result(reasons, overheat_check)
        if rack_result["status"] != CheckStatus.FAILED:
            return check_common(overheat_check, rack_result)

        if self._host_cpu_not_overheating(reasons):
            wait_template = "Rack overheat check for rack {rack} queue {queue} failed but host is healthy."
            return self._decision_wait(host, wait_template)

        return self._decision_repair(host, rack_result)

    @staticmethod
    def _host_cpu_not_overheating(reasons):
        """Tell whether the host's CPU_CACHES check does NOT point at CPU overheating.

        True when the check is not FAILED, or when it is FAILED but the eine
        code extracted from its metadata differs from ``["CPU_OVERHEATING"]``.
        """
        cpu_result = get_check_result(reasons, CheckType.CPU_CACHES)

        # Metadata is only inspected for a FAILED check.
        if cpu_result["status"] != CheckStatus.FAILED:
            return True

        eine_code = get_eine_code(cpu_result["metadata"]["result"])
        return eine_code != ["CPU_OVERHEATING"]

    @staticmethod
    def _decision_wait(host, reason):
        """Build a wait decision; ``reason`` is a template with ``{queue}`` and ``{rack}`` fields."""
        text = reason.format(queue=host.location.short_queue_name, rack=host.location.rack)
        return Decision.wait(
            reason=text,
            checks=[CheckType.WALLE_RACK_OVERHEAT],
            failure_type=FailureType.RACK_OVERHEAT,
        )

    @staticmethod
    def _decision_repair(host, check_result):
        """Build a ``REPAIR_RACK_OVERHEAT`` decision describing the failed check.

        The reason text is filled from the host location and from the
        ``failed`` / ``total`` / ``threshold_failed`` entries of the check metadata.
        """
        queue_name = host.location.short_queue_name
        rack_name = host.location.rack
        stats = check_result["metadata"]

        message = (
            "Rack {rack} in queue {queue} is overheated:"
            " {failed} of {total} hosts overheated (threshold is {threshold})."
        ).format(
            queue=queue_name,
            rack=rack_name,
            failed=stats["failed"],
            total=stats["total"],
            threshold=stats["threshold_failed"],
        )

        return Decision(
            WalleAction.REPAIR_RACK_OVERHEAT,
            reason=message,
            checks=[CheckType.WALLE_RACK_OVERHEAT],
            failure_type=FailureType.RACK_OVERHEAT,
            failure_check_info=check_result,
        )

    def escalate(self, host, decision):
        """Run ``decision`` through this rule's escalation rules and return the result."""
        rules = self.escalation_rules
        return rules.escalate(host, decision)


class RackRule(AbstractRule):
    """Decision rule driven by the ``WALLE_RACK`` check.

    When the rack check is FAILED, the rule requests ``REPAIR_RACK_FAILURE``
    only if the host itself is unavailable, the netmon check confirms that
    queue and datacenter levels are fine, and every host in the rack has
    failed. In all other FAILED cases it returns a wait decision.
    Both escalation points lead to ``escalate_to_deactivate``.
    """

    escalation_rules = EscalationRules(
        EscalationPoint(
            predicate=action_match(WalleAction.REPAIR_RACK_FAILURE),
            reason=task_has_not_helped(
                Operation.REPAIR_RACK_FAILURE.host_status, "Ticket is closed, rack has not been repaired"
            ),
            action=escalate_to_deactivate,
        ),
        # Escalate once the "max_host_rack_repairs" limit is reached.
        EscalationPoint(
            predicate=action_match(WalleAction.REPAIR_RACK_FAILURE),
            reason=limit_reached("max_host_rack_repairs", Operation.REPAIR_RACK_FAILURE),
            action=escalate_to_deactivate,
        ),
    )

    def make_decision(self, host, reasons, enabled_checks):
        """Produce a decision for ``host`` from the rack check result.

        Order of evaluation:
        1. check disabled for the host -> healthy decision;
        2. check status is not FAILED -> whatever ``check_common`` returns;
        3. host is available -> wait;
        4. network is not known to be ok -> wait;
        5. fewer hosts failed than the rack total -> wait;
        6. otherwise -> repair decision.
        """
        check_type = CheckType.WALLE_RACK

        if is_disabled_check(host, check_type, enabled_checks):
            return Decision.healthy("Rack check is not enabled for the host.", checks=[check_type])

        check_result = get_check_result(reasons, check_type)
        if check_result["status"] != CheckStatus.FAILED:
            return check_common(check_type, check_result)

        if self._host_available(reasons):
            return self._decision_wait(host, "Rack check for rack {rack} queue {queue} failed but host is available.")

        if not self._is_network_ok(reasons):
            return self._decision_wait(
                host, "Rack check for rack {rack} queue {queue} failed but netmon check shows some other problems."
            )

        if check_result["metadata"]["failed"] < check_result["metadata"]["total"]:
            # Only report racks if all hosts in the rack have failed.
            # NB: this is not the final behaviour, we are test-driving it.
            # The check turns "red" when 80% of hosts fail, which can actually mean a switch failure.
            return self._decision_wait(
                host, "Some host has failed for rack {rack} queue {queue}. Waiting until all hosts fail."
            )

        return self._decision_repair(host, check_result)

    @staticmethod
    def _host_available(reasons):
        """Return True if at least one availability check is not FAILED, else False.

        Iterates ``CheckType.ALL_AVAILABILITY`` and stops at the first check
        whose status differs from FAILED. Always returns a real ``bool``
        (previously the negative case fell through and returned ``None``).
        """
        return any(
            get_check_result(reasons, check_type)["status"] != CheckStatus.FAILED
            for check_type in CheckType.ALL_AVAILABILITY
        )

    @staticmethod
    def _is_network_ok(reasons):
        """Return True only when the netmon check positively confirms the network is ok.

        The netmon status must be PASSED, FAILED or MISSING, and both the
        ``queue`` and ``datacenter`` entries of its metadata must have status
        PASSED. Anything else (including absent metadata) yields False.
        """
        # Return ok (True) only if we know for sure that network is ok:
        # either we have a definitely PASSED check
        # or we have a FAILED check but with 'dc' and 'queue' levels PASSED
        # 'switch' level failure does not mean anything: all hosts are dead which makes switch check "fail".

        result = get_check_result(reasons, CheckType.NETMON)
        if result["status"] not in {CheckStatus.PASSED, CheckStatus.FAILED, CheckStatus.MISSING}:
            return False

        # MISSING can mean we do not have switch status, but we still can have dc and queue.
        # do not check switch status: whether it is failed, passed or missing, it says nothing useful to us.
        metadata = result.get("metadata", {})
        for domain in ("queue", "datacenter"):
            domain_status = metadata.get(domain, {}).get("status")
            if domain_status != CheckStatus.PASSED:
                return False

        return True

    @staticmethod
    def _decision_wait(host, reason):
        """Build a wait decision; ``reason`` is a template with ``{queue}`` and ``{rack}`` fields."""
        return Decision.wait(
            reason=reason.format(queue=host.location.short_queue_name, rack=host.location.rack),
            checks=[CheckType.WALLE_RACK],
            failure_type=FailureType.RACK_COMMON,
        )

    @staticmethod
    def _decision_repair(host, check_result):
        """Build a ``REPAIR_RACK_FAILURE`` decision describing the failed check.

        The reason text is filled from the host location and from the
        ``failed`` / ``total`` / ``threshold_failed`` entries of the check metadata.
        """
        template = (
            "Rack {rack} in queue {queue} has failed: {failed} of {total} hosts failed (threshold is {threshold})."
        )
        message = template.format(
            queue=host.location.short_queue_name,
            rack=host.location.rack,
            failed=check_result["metadata"]["failed"],
            total=check_result["metadata"]["total"],
            threshold=check_result["metadata"]["threshold_failed"],
        )

        return Decision(
            WalleAction.REPAIR_RACK_FAILURE,
            reason=message,
            checks=[CheckType.WALLE_RACK],
            failure_type=FailureType.RACK_COMMON,
            failure_check_info=check_result,
        )

    def escalate(self, host, decision):
        """Run ``decision`` through this rule's escalation rules and return the result."""
        return self.escalation_rules.escalate(host, decision)
