#!/skynet/python/bin/python

import textwrap
import calendar
import datetime as dt

import requests

import utils


# Check name passed as the first argument to utils.say().
SERVICE_NAME = "sandbox_slo"
# Length of the look-back window in seconds (15 minutes); subtracted from the
# current UNIX timestamp in the query below.
TIMESTEP = 15 * 60
# Percentile of the call duration to evaluate. Must be an integer in 1..99:
# it is rendered into the query as the fractional part of the quantile level.
PERCENTILE = 98
# ClickHouse query returning, per API method, the requested duration quantile
# over the last `timestep` seconds.
# Placeholders: utcnow (UNIX timestamp, seconds), timestep (seconds),
# percentile (int). The percentile is zero-padded to two digits so that
# single-digit values render correctly (5 -> 0.05 rather than 0.5).
QUERY_TEMPLATE = textwrap.dedent("""
    SELECT
      method, quantile(0.{percentile:02d})(duration) AS duration
    FROM sandbox.apicalld
    WHERE
      timestamp BETWEEN toDateTime({utcnow} - {timestep}) AND toDateTime({utcnow})
    GROUP BY method
    FORMAT JSON
""")


# CRIT thresholds per API method, in seconds: a measured percentile above
# these values is reported as critical.
# Originally described as "SLO timing plus 20%".
# NOTE(review): only Task.create (10 -> 12) is exactly WARN + 20%;
# Task.update (7 -> 10) and BatchTask.start (21 -> 23) are not - confirm
# which of the comment or the values is authoritative.
RESPONSE_TIME_PERCENTILES_CRIT = {
    "Task.create": 12,
    "Task.update": 10,
    "BatchTask.start": 23,
}


# WARN thresholds per API method, in seconds: the SLO timings themselves.
# Source: https://wiki.yandex-team.ru/sandbox/sla/#slo
# Keys must match those of RESPONSE_TIME_PERCENTILES_CRIT: main() looks up
# both dicts with the same method names.
RESPONSE_TIME_PERCENTILES_WARN = {
    "Task.create": 10,
    "Task.update": 7,
    "BatchTask.start": 21,
}


def _describe_duration(duration, threshold_warn, threshold_crit):
    """Render one method's measured duration against its thresholds.

    :param duration: measured percentile in seconds, or None when the query
        returned no usable value for the method.
    :param threshold_warn: WARN threshold in seconds.
    :param threshold_crit: CRIT threshold in seconds.
    :return: human-readable fragment for the check message.
    """
    if duration is None:
        return "DATA NOT FOUND"
    if duration > threshold_crit:
        return "{:.01f}s > {}s".format(duration, threshold_crit)
    if duration > threshold_warn:
        return "{:.01f}s > {}s".format(duration, threshold_warn)
    return "{:.01f}s <= {}s".format(duration, threshold_warn)


def main():
    """Query ClickHouse for API call duration percentiles and report the SLO status.

    The result is reported through utils.say():
      * WARN when ClickHouse is unreachable or returns no rows;
      * status 2 when any monitored method exceeds its CRIT threshold or has
        no data, status 1 for the same condition against WARN thresholds,
        status 0 otherwise.
    """
    utcnow = calendar.timegm(dt.datetime.utcnow().timetuple())
    query = QUERY_TEMPLATE.format(
        utcnow=utcnow,
        timestep=TIMESTEP,
        percentile=PERCENTILE,
    )

    try:
        data = utils.query_clickhouse(query)
    except requests.RequestException as exc:
        utils.say(SERVICE_NAME, utils.STATUS_WARN, "Data source is unreachable: {}".format(exc))
        return

    # Use .get() so that a response without the "data" key is reported as
    # "No data" instead of crashing with KeyError.
    rows = data.get("data") if data else None
    if not rows:
        utils.say(SERVICE_NAME, utils.STATUS_WARN, "No data (empty response received from ClickHouse)", die=True)
        # say(..., die=True) is presumably terminal; return explicitly so that
        # nothing below runs on empty data if it ever comes back.
        return

    # Only methods that have thresholds are of interest; None marks "no data".
    durations = dict.fromkeys(RESPONSE_TIME_PERCENTILES_CRIT)
    for item in rows:
        method_name = item["method"]
        raw_duration = item["duration"]
        if method_name not in durations or raw_duration is None:
            # A null duration is treated the same way as a missing row.
            continue
        # Duration is stored in milliseconds in ClickHouse. Divide by a float
        # so an integer value is not truncated under Python 2 division rules.
        durations[method_name] = raw_duration / 1000.0

    message = " | ".join(
        "{}: {}".format(
            name,
            _describe_duration(
                durations[name],
                RESPONSE_TIME_PERCENTILES_WARN[name],
                RESPONSE_TIME_PERCENTILES_CRIT[name],
            ),
        )
        for name in sorted(RESPONSE_TIME_PERCENTILES_CRIT)
    )

    def is_broken(thresholds):
        # Missing data counts as a breach: silence must not look like success.
        return any(
            durations[name] is None or durations[name] > limit
            for name, limit in thresholds.items()
        )

    # Numeric status codes passed to utils.say(): 2 for a CRIT breach, 1 for a
    # WARN breach, 0 otherwise.
    # NOTE(review): presumably these match utils.STATUS_* constants - confirm.
    if is_broken(RESPONSE_TIME_PERCENTILES_CRIT):
        status_code = 2
    elif is_broken(RESPONSE_TIME_PERCENTILES_WARN):
        status_code = 1
    else:
        status_code = 0

    utils.say(SERVICE_NAME, status_code, message)


# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
