"""Contains all logic for handling IPMI errors."""

import logging
from datetime import timedelta, datetime

import humanize

import walle.admin_requests.request as admin_requests
import walle.fsm_stages.common
from sepelib.core import config, constants
from sepelib.core.exceptions import LogicalError
from walle.fsm_stages.common import commit_stage_changes, fail_current_stage, push_host_ticket
from walle.models import timestamp

# Module-level logger named after this module.
log = logging.getLogger(__name__)


# Passed as ``check_after`` to ``commit_stage_changes`` whenever a stage is rescheduled after an IPMI/power error.
# NOTE(review): presumably expressed in seconds (built from ``constants.MINUTE_SECONDS``) - confirm.
HARDWARE_ERROR_RETRY_INTERVAL = 2 * constants.MINUTE_SECONDS
"""Retry interval for task after a hardware error."""

# Compared against ``timestamp() - request["close_time"]`` for admin requests in the processed status.
HOST_RECOVERY_TIMEOUT = 5 * constants.MINUTE_SECONDS
"""
Assume that host must become healthy during this timeout after admin request closing (closed request may not mean that
host is alive already - DC engineer may just reset the BMC and close the request assuming that BMC reset will heal the
host).
"""

# Compared against ``timestamp() - request["create_time"]`` for admin requests reported as not existing.
LOST_REQUEST_TIMEOUT = 2 * constants.HOUR_SECONDS
"""Sometimes BOT experiences temporary database problems and return "UNKNOWN", but later it become alive again.
If we create new request during this period, we create duplicate.
Assume 2 hours is enough for BOT's database to revive."""


def admin_requests_enabled(host):
    """Tell whether admin requests may be used for the host's current task.

    Consults the ``hardware.enable_admin_requests`` config value first and, only when it is truthy,
    the task's ``disable_admin_requests`` flag.
    """
    globally_enabled = config.get_value("hardware.enable_admin_requests")
    if not globally_enabled:
        # Same result as short-circuiting ``and``: the falsy config value itself is handed back.
        return globally_enabled

    return not host.task.disable_admin_requests


def reset_ipmi_errors(host):
    """Delete both IPMI-related error counters from the host's task."""
    for counter_name in ("hardware_error_count", "power_error_count"):
        delattr(host.task, counter_name)


def handle_ipmi_host_missing_error(host, ipmi_fqdn, error):
    """React to an error saying that the host's IPMI FQDN is missing.

    The stage is failed right away when admin requests are disabled. Otherwise the handler either waits for
    an admin request that is still being worked on, files a new IPMI_HOST_MISSING request, re-files it while
    the ``hardware.max_reopened_admin_requests`` limit allows, or finally fails the stage.
    """
    message = str(error)
    log.warning("%s: %s", host.human_id(), message)

    if not admin_requests_enabled(host):
        return fail_current_stage(host, message)

    if _delay_processing_if_healing_by_dc_engineers(host, message):
        # Stage changes have already been committed by the helper.
        return None

    last_request = admin_requests.get_last_request_status(admin_requests.RequestTypes.IPMI_HOST_MISSING, host.inv)
    if last_request is None:
        log.warning(
            "%s: There are no active admin requests for missing IPMI host %s. Creating a new one.",
            host.human_id(),
            ipmi_fqdn,
        )
        return _create_ipmi_host_missing_admin_request(host, message)

    message = message + " There is a processed admin request for missing IPMI host but the problem persists."

    may_reopen = host.task.reopened_admin_request_count < config.get_value("hardware.max_reopened_admin_requests")
    if not may_reopen:
        # Reopen budget is exhausted: give up on the stage.
        log.error("%s: %s", host.human_id(), message)
        fail_current_stage(host, message)
        return None

    log.warning("%s: %s Creating a new one.", host.human_id(), message)
    return _create_ipmi_host_missing_admin_request(host, message, reopen=True)


def _create_ipmi_host_missing_admin_request(host, error, reopen=False):
    """File an IPMI_HOST_MISSING admin request and reschedule the current stage.

    With ``reopen`` set, the task's ``reopened_admin_request_count`` is incremented after the request is created.
    The stage is committed with the error text and re-checked after ``ADMIN_REQUEST_CHECK_INTERVAL``.
    """
    request_type = admin_requests.RequestTypes.IPMI_HOST_MISSING
    admin_requests.create_admin_request(host, request_type, reason=error)

    if reopen:
        host.task.reopened_admin_request_count += 1

    recheck_interval = walle.fsm_stages.common.ADMIN_REQUEST_CHECK_INTERVAL
    commit_stage_changes(host, error=error, check_after=recheck_interval)


def handle_ipmi_error(host, error):
    """React to a generic IPMI error raised while processing the host.

    While the task's ``hardware_error_count`` stays within ``hardware.max_ipmi_errors`` the stage is simply
    retried after ``HARDWARE_ERROR_RETRY_INTERVAL``. Past that limit the stage is failed when admin requests
    are disabled; otherwise an IPMI_UNREACHABLE request is created, re-created while
    ``hardware.max_reopened_admin_requests`` allows, and the stage is failed once that budget is spent.
    """
    message = str(error)
    log.warning("%s: %s", host.human_id(), message)

    if admin_requests_enabled(host):
        if _delay_processing_if_healing_by_dc_engineers(host, message):
            # Stage changes have already been committed by the helper.
            return None

    host.task.hardware_error_count += 1
    seen_errors = host.task.hardware_error_count
    max_errors = config.get_value("hardware.max_ipmi_errors")

    if seen_errors <= max_errors:
        log.info(
            "%s: Encountered %s hardware errors from allowed %s. "
            "Retry the operation after hardware error retry interval.",
            host.human_id(),
            seen_errors,
            max_errors,
        )
        return commit_stage_changes(host, error=message, check_after=HARDWARE_ERROR_RETRY_INTERVAL)

    if not admin_requests_enabled(host):
        return fail_current_stage(host, message)

    last_request = admin_requests.get_last_request_status(admin_requests.RequestTypes.IPMI_UNREACHABLE, host.inv)
    if last_request is None:
        log.warning("%s: There are no active admin requests for unreachable IPMI. Creating a new one.", host.human_id())
        return _create_ipmi_unreachable_admin_request(host, message)

    message = message + " There is a processed admin request for unreachable IPMI but the problem persists."

    may_reopen = host.task.reopened_admin_request_count < config.get_value("hardware.max_reopened_admin_requests")
    if not may_reopen:
        # Reopen budget is exhausted: give up on the stage.
        log.error("%s: %s", host.human_id(), message)
        fail_current_stage(host, message)
        return None

    log.warning("%s: %s Creating a new one.", host.human_id(), message)
    return _create_ipmi_unreachable_admin_request(host, message, reopen=True)


def handle_power_on_off_timeout(host, error, retry_status):
    """React to a timeout of a power on/off operation.

    ``retry_status`` is the stage status passed along to every commit that reschedules the stage.
    While the task's ``power_error_count`` stays within ``hardware.max_power_errors`` the stage is retried after
    ``HARDWARE_ERROR_RETRY_INTERVAL``. Past that limit the stage is failed when admin requests are disabled;
    otherwise an IPMI_UNREACHABLE request is created, awaited, re-created while
    ``hardware.max_reopened_admin_requests`` allows, and the stage is failed once that budget is spent.
    """
    message = str(error)
    log.warning("%s: %s", host.human_id(), message)

    if admin_requests_enabled(host):
        if _delay_processing_if_healing_by_dc_engineers(host, message, retry_status=retry_status):
            # Stage changes have already been committed by the helper.
            return None

    host.task.power_error_count += 1
    seen_errors = host.task.power_error_count
    max_errors = config.get_value("hardware.max_power_errors")

    if seen_errors <= max_errors:
        log.info(
            "%s: Encountered %s power on/off timeout errors from allowed %s. "
            "Retry the operation after hardware error retry interval.",
            host.human_id(),
            seen_errors,
            max_errors,
        )
        return commit_stage_changes(
            host, status=retry_status, error=message, check_after=HARDWARE_ERROR_RETRY_INTERVAL
        )

    if not admin_requests_enabled(host):
        return fail_current_stage(host, message)

    last_request = admin_requests.get_last_request_status(admin_requests.RequestTypes.IPMI_UNREACHABLE, host.inv)
    if last_request is None:
        log.warning("%s: There are no active admin requests for broken IPMI. Creating a new one.", host.human_id())
        return _create_ipmi_unreachable_admin_request(host, message, retry_status=retry_status)

    still_handled = _delay_processing_if_request_in_process(
        host, last_request, admin_requests.RequestTypes.IPMI_UNREACHABLE, message, retry_status=retry_status
    )
    if still_handled:
        return None

    message = message + " There is a processed admin request for broken IPMI but the problem persists."

    may_reopen = host.task.reopened_admin_request_count < config.get_value("hardware.max_reopened_admin_requests")
    if not may_reopen:
        # Reopen budget is exhausted: give up on the stage.
        log.error("%s: %s", host.human_id(), message)
        fail_current_stage(host, message)
        return None

    log.warning("%s: %s Creating a new one.", host.human_id(), message)
    return _create_ipmi_unreachable_admin_request(host, message, retry_status=retry_status, reopen=True)


def _create_ipmi_unreachable_admin_request(host, error, retry_status=None, reopen=False):
    """File an IPMI_UNREACHABLE admin request and reschedule the current stage.

    After the request is created the IPMI error counters are reset and, with ``reopen`` set, the task's
    ``reopened_admin_request_count`` is incremented. The stage is then committed with ``retry_status`` and
    the error text, to be re-checked after ``ADMIN_REQUEST_CHECK_INTERVAL``.
    """
    request_type = admin_requests.RequestTypes.IPMI_UNREACHABLE
    admin_requests.create_admin_request(host, request_type, reason=error)

    reset_ipmi_errors(host)
    if reopen:
        host.task.reopened_admin_request_count += 1

    recheck_interval = walle.fsm_stages.common.ADMIN_REQUEST_CHECK_INTERVAL
    commit_stage_changes(host, status=retry_status, error=error, check_after=recheck_interval)


def _delay_processing_if_healing_by_dc_engineers(host, error, retry_status=None):
    """Postpone host processing when some IPMI admin request shows the host is still being handled.

    Every request type from ``RequestTypes.ALL_IPMI`` is inspected in turn. A ticket found in a request is
    pushed to the host, and the ``ticket`` field is committed when ``push_host_ticket`` returns a truthy value.
    Returns True as soon as one request caused the stage to be delayed, False when none did.
    """
    for ipmi_request_type in admin_requests.RequestTypes.ALL_IPMI:
        last_request = admin_requests.get_last_request_status(ipmi_request_type, host.inv)
        if last_request is None:
            continue

        if "ticket" in last_request:
            if push_host_ticket(host, last_request["ticket"]):
                commit_stage_changes(host, extra_fields=["ticket"])

        delayed = _delay_processing_if_request_in_process(
            host, last_request, ipmi_request_type, error, retry_status=retry_status
        )
        if delayed:
            return True

    return False


def _delay_processing_if_request_in_process(host, request, request_type, error, retry_status=None):
    """Decide from an admin request's status whether host processing has to be postponed.

    The stage is postponed (error counters reset, stage changes committed with ``retry_status``) when:

    * the request is still in process - re-check after ``ADMIN_REQUEST_CHECK_INTERVAL``;
    * the request was processed less than ``HOST_RECOVERY_TIMEOUT`` ago (by its ``close_time``) - re-check after
      ``HARDWARE_ERROR_RETRY_INTERVAL``;
    * the request is reported as not existing although it was created less than ``LOST_REQUEST_TIMEOUT`` ago
      (by its ``create_time``) - re-check after ``HARDWARE_ERROR_RETRY_INTERVAL``.

    Returns True when the stage has been postponed and False when the caller should carry on
    (older processed / not existing requests and deleted requests).

    Raises LogicalError for any status that is neither handled nor explicitly ignored here.
    """
    status = request["status"]

    if status == admin_requests.STATUS_IN_PROCESS:
        log.info(
            "%s: There is an active '%s' admin request. Waiting when it will be processed.",
            host.human_id(),
            request_type.type,
        )

        reset_ipmi_errors(host)
        commit_stage_changes(
            host,
            check_after=walle.fsm_stages.common.ADMIN_REQUEST_CHECK_INTERVAL,
            status=retry_status,
            error=error + " There is an active admin request for broken IPMI. Waiting when it will be processed.",
        )

        return True
    elif status == admin_requests.STATUS_PROCESSED:
        elapsed_time = timestamp() - request["close_time"]
        if elapsed_time < HOST_RECOVERY_TIMEOUT:
            # The adjacent literals are concatenated as-is, so the separating space must be spelled out.
            log.info(
                "%s: There is a '%s' admin request that has been processed %s. "
                "Give a little time to the host to recover...",
                host.human_id(),
                request_type.type,
                humanize.naturaltime(timedelta(seconds=elapsed_time)),
            )

            reset_ipmi_errors(host)
            commit_stage_changes(host, status=retry_status, error=error, check_after=HARDWARE_ERROR_RETRY_INTERVAL)

            return True
    elif status == admin_requests.STATUS_NOT_EXIST:
        # Tricky case: sometimes BOT experiences temporary database problems and returns "UNKNOWN",
        # but later it becomes alive again. If we create a new request during this period, we create a duplicate.
        # The trick is that the request may have been genuinely lost, and then a new request needs to be created.
        if timestamp() - request["create_time"] < LOST_REQUEST_TIMEOUT:
            log.info(
                "%s: There should be a '%s' admin request that was created at %s, but it has gone. "
                "Give BOT some time to return it back...",
                host.human_id(),
                request_type.type,
                datetime.fromtimestamp(request["create_time"]),
            )

            reset_ipmi_errors(host)
            commit_stage_changes(host, status=retry_status, error=error, check_after=HARDWARE_ERROR_RETRY_INTERVAL)
            return True

    # Assert that we know how to handle all possible statuses.
    # Any new status must be either handled or explicitly ignored.
    elif status != admin_requests.STATUS_DELETED:
        raise LogicalError()

    return False
