#!/usr/bin/env python

# Provides: walle_memory
from __future__ import absolute_import

import json
import os.path
import time

from juggler.bundles import as_check, Status, Event
from .common import get_hw_watcher_status, oldstyle_main, timestamp, WALLE_STATE_DIR, MEMORY_REPAIR_FILE_PATH

# Name under which this check is registered (passed to as_check and oldstyle_main below).
CHECK_NAME = "walle_memory"
# Maximum length, in characters, of the JSON description that make_event tries to stay within.
DESCRIPTION_LEN_LIMIT = 800
# Lower bound used by shrink_result when choosing how much of a result's "comment" to keep.
MIN_COMMENT_LENGTH = 94

# Flag file created while the check is CRIT and removed again once the check is OK
# (see _write_memory_repair_time).
MEMORY_CRIT_FLAG_FILE_PATH = os.path.join(WALLE_STATE_DIR, "memory_crit_flag")


def shrink_result(res):
    """Remove one piece of detail from a hw-watcher result so its JSON gets shorter.

    The caller invokes this repeatedly while the serialized description is too
    long. Every call performs at most one shrinking step:

    1. While more than one entry is left in ``res["reason"]``, drop the last one.
    2. Otherwise shorten ``res["comment"]`` (if present) and mark the cut with
       ``"..."``. A comment longer than the description limit is first capped at
       that limit; on later calls it is halved, but never cut below
       ``MIN_COMMENT_LENGTH`` characters.

    :param res: result dict with a ``"reason"`` list and an optional ``"comment"``
        string. It is modified in place.
    :returns: ``True`` if something was removed, ``False`` if nothing more can
        be shrunk.
    """
    suffix = "..."

    if len(res["reason"]) > 1:
        # Every reason line is valuable. But juggler can not pass too many of them through.
        res["reason"].pop()
        return True

    if "comment" not in res:
        return False

    comment = res["comment"]
    first_cap = max(DESCRIPTION_LEN_LIMIT, MIN_COMMENT_LENGTH)
    if len(comment) > first_cap + len(suffix):
        # A comment this long can never fit: cut it down to the limit in one step.
        keep = first_cap
    elif len(comment) > MIN_COMMENT_LENGTH + len(suffix):
        # A comment capped at the limit still overflows once the surrounding JSON is
        # added, so keep halving it. The result is strictly shorter on every call,
        # which guarantees that the caller's loop terminates.
        keep = max(len(comment) // 2, MIN_COMMENT_LENGTH)
    else:
        return False

    res["comment"] = comment[:keep] + suffix
    return True


def run_check():
    """Query hw-watcher for the "ecc" and "mem" checks and derive the overall status.

    :returns: a ``(status, metadata)`` pair where ``metadata`` is
        ``{"results": {"ecc": ..., "mem": ...}}`` holding the raw hw-watcher results.
    """
    results = {name: get_hw_watcher_status(name) for name in ("ecc", "mem")}

    ecc_status = results["ecc"]["status"]
    mem_status = results["mem"]["status"]
    return check_status(ecc_status, mem_status), {"results": results}


def check_status(ecc_status, mem_status):
    """Combine the hw-watcher statuses of the ecc and mem checks into a juggler status.

    * CRIT - either check is "FAILED", or the ecc check is "UNKNOWN";
    * OK   - both checks are "OK";
    * WARN - anything else (including an "UNKNOWN" mem check).
    """
    statuses = {ecc_status, mem_status}

    if "FAILED" in statuses or ecc_status == "UNKNOWN":
        return Status.CRIT

    if statuses == {"OK"}:
        return Status.OK

    return Status.WARN


def make_event(status, metadata):
    """Build the juggler event whose description is ``metadata`` serialized as JSON.

    While the description is longer than DESCRIPTION_LEN_LIMIT, the "ecc" result and
    then the "mem" result are shrunk in place (one shrink_result step at a time)
    until the description fits or the result cannot be shrunk any further.

    :param status: status to put into the event.
    :param metadata: dict with a "results" mapping; may be modified in place.
    :returns: ``Event(status, description)``.
    """
    description = json.dumps(metadata)

    for check_key in ("ecc", "mem"):
        # Re-serialize after every successful shrink step; move on to the next
        # result as soon as the text fits or this one has nothing left to give up.
        while len(description) > DESCRIPTION_LEN_LIMIT and shrink_result(metadata["results"][check_key]):
            description = json.dumps(metadata)

    # NOTE: the description may still be over the limit here, and nothing more is
    # done about it. In that case juggler will trim the description, wall-e won't be
    # able to parse it and the check will be invalid. There is monitoring which
    # reports invalid checks, so whatever gets through can be analyzed and fixed.
    return Event(status, description)


def _write_memory_repair_time(event):
    """Record on disk when the memory check recovers from CRIT.

    * On a CRIT event an empty flag file (MEMORY_CRIT_FLAG_FILE_PATH) is created.
    * On an OK event, if that flag file exists, the current unix time is written to
      MEMORY_REPAIR_FILE_PATH and the flag file is removed.
    * Any other status leaves both files untouched.

    :param event: event produced by the check; only ``event.status`` is read.
    :raises IOError, OSError: if the state directory or the files can not be written.
    """
    # NOTE(rocco66): memory problems are often the reason for reboots, so the time at
    #                which memory was repaired is stored for the walle_reboots check.
    #                See https://st.yandex-team.ru/WALLE-4160
    if not os.path.exists(WALLE_STATE_DIR):
        try:
            os.mkdir(WALLE_STATE_DIR)
        except OSError:
            # Somebody else may have created the directory between the check and
            # mkdir(); that is fine. Anything else is a real error.
            if not os.path.isdir(WALLE_STATE_DIR):
                raise

    if event.status == Status.CRIT:
        with open(MEMORY_CRIT_FLAG_FILE_PATH, "w+"):
            pass
    elif event.status == Status.OK and os.path.exists(MEMORY_CRIT_FLAG_FILE_PATH):
        # Persist the repair time first and drop the flag only afterwards: if the
        # write fails, the flag survives and the next OK run retries instead of
        # silently losing the CRIT -> OK transition.
        with open(MEMORY_REPAIR_FILE_PATH, "w+") as repair_time_file:
            repair_time_file.write(str(int(time.time())))
        os.remove(MEMORY_CRIT_FLAG_FILE_PATH)


@as_check(name=CHECK_NAME)
def juggler_check():
    """Run the walle_memory check and return its juggler event.

    If collecting the hw-watcher results or building the event raises, a WARN event
    is returned instead, with placeholder "Disabled" results for both checks that
    carry the error text. Otherwise the memory repair bookkeeping is updated on a
    best-effort basis before the event is returned.
    """
    try:
        status, metadata = run_check()
        event = make_event(status, metadata)
    except Exception as e:
        placeholder_results = {}
        for check_key in ("ecc", "mem"):
            placeholder_results[check_key] = {
                "reason": ["Can't get status from hw-watcher: {}".format(e)],
                "status": "Disabled",
                "timestamp": timestamp(),
            }
        return make_event(Status.WARN, {"results": placeholder_results})

    try:
        _write_memory_repair_time(event)
    except Exception:
        # NOTE(rocco66): this bookkeeping is needed by the walle_reboots check, not by
        #                walle_memory, so a failure here must not affect the event.
        pass

    return event


if __name__ == "__main__":
    # Script entry point: run the check once and hand its name and the resulting
    # event to oldstyle_main (imported from .common; presumably it reports the event
    # in the legacy output format - confirm against .common).
    oldstyle_main(CHECK_NAME, juggler_check())
