#!/skynet/python/bin/python

import sys
import httplib
import textwrap
import datetime as dt
import itertools as it
import requests
import simplejson.decoder
import utils

from utils import Environment


# Name under which every status line of this check is reported via utils.say().
SERVICE_NAME = "sandbox_dead_clients"
# Database queried when the environment is Environment.PRODUCTION.
DB_NAME = "sandbox"
# Database queried for any other environment.
DB_NAME_PRE_PRODUCTION = "sandbox1"

# Query returning, as JSON, every `clientinfod` row whose timestamp equals the most
# recent timestamp found in `clientinfo` within `now() - 3600` (presumably the last
# hour, in seconds). `{db_name}` is substituted with one of the database names above.
# NOTE(review): the outer query reads `clientinfod` while the subquery reads
# `clientinfo` - presumably a distributed table and its local counterpart; confirm
# that the difference is intentional.
QUERY_TEMPLATE = textwrap.dedent("""
    SELECT *
    FROM {db_name}.clientinfod
    WHERE timestamp IN (
      SELECT max(timestamp)
      FROM {db_name}.clientinfo
      WHERE timestamp >= now() - 3600
    )
    FORMAT JSON
""")


class Checker(object):
    """
    Figure out if amount of dead clients matching all possible intersections of PURPOSE and DENSITY tags
    exceeds the corresponding threshold, which is dynamic and based on:
    - set size (smaller means higher threshold);
    - dead count (a sane minimal amount of inactive clients is introduced);
    - day time (you must knock out A LOT of hosts in a set at night to trigger the alarm).

    Every client is a mapping providing a "tags" container (tested with `in`) and an "alive" flag.
    A tag filter is a sequence of tag names; a name prefixed with "~" means "client must NOT have this tag".
    """

    # Groups with fewer clients than this use the `*_ALERT_RATIO_SMALL` thresholds.
    SMALL_GROUP_SIZE = 20

    # Thresholds applied while `self.daytime` is true.
    DAILY_ALERT_RATIO = 0.1
    DAILY_ALERT_RATIO_SMALL = 0.5
    DAILY_ALERT_RATIO_OSX = 0.2
    DAILY_ALERT_RATIO_TOTAL = DAILY_ALERT_RATIO / 2
    DAILY_NUMERIC_THRESHOLD = 5

    # Thresholds applied otherwise.
    NIGHTLY_ALERT_RATIO = 0.1
    NIGHTLY_ALERT_RATIO_SMALL = 0.75
    NIGHTLY_ALERT_RATIO_OSX = 0.3
    NIGHTLY_ALERT_RATIO_TOTAL = NIGHTLY_ALERT_RATIO / 2
    NIGHTLY_NUMERIC_THRESHOLD = 10

    def __init__(self, clients):
        """
        :param clients: iterable of client mappings (see the class docstring); it is iterated
                        on every `count_dead()` call, so it must be re-iterable.
        """
        # Failed groups collected by `fill_failed_groups()` as `(tags, dead, total)` tuples.
        self.groups = []
        self.clients = clients
        # Whether the current local time falls into the daily alert span; selects the threshold set.
        self.daytime = utils.DailyAlertSpan.contains(dt.datetime.now())

    @property
    def alert_ratio(self):
        """ Dead ratio threshold for a regular group. """
        return self.DAILY_ALERT_RATIO if self.daytime else self.NIGHTLY_ALERT_RATIO

    @property
    def alert_ratio_small(self):
        """ Dead ratio threshold for a group smaller than `SMALL_GROUP_SIZE`. """
        return self.DAILY_ALERT_RATIO_SMALL if self.daytime else self.NIGHTLY_ALERT_RATIO_SMALL

    @property
    def alert_ratio_osx(self):
        """ Dead ratio threshold for a group selected with the "OSX" tag. """
        return self.DAILY_ALERT_RATIO_OSX if self.daytime else self.NIGHTLY_ALERT_RATIO_OSX

    @property
    def alert_ratio_total(self):
        """ Dead ratio threshold for the whole fleet (empty tag filter). """
        return self.DAILY_ALERT_RATIO_TOTAL if self.daytime else self.NIGHTLY_ALERT_RATIO_TOTAL

    @property
    def numeric_threshold(self):
        """ Minimal absolute amount of dead clients required to raise an alert. """
        return self.DAILY_NUMERIC_THRESHOLD if self.daytime else self.NIGHTLY_NUMERIC_THRESHOLD

    def pick_ratio(self, total, tags):
        """
        Choose the dead ratio threshold for a group.

        The checks are ordered by priority: the group size wins over the "OSX" tag,
        which wins over the empty (TOTAL) filter.

        :param total: amount of clients in the group
        :param tags: tag filter the group was selected with
        :return: ratio in range [0, 1]
        """
        if total < self.SMALL_GROUP_SIZE:
            return self.alert_ratio_small
        if "OSX" in tags:
            return self.alert_ratio_osx
        if not tags:  # all tags
            return self.alert_ratio_total
        return self.alert_ratio

    def count_dead(self, tags=()):
        """
        Count the clients matching the given tag filter and the dead ones among them.

        :param tags: tag filter; an empty filter matches every client
        :return: `(tags, dead, total)` tuple
        """
        dead, total = 0, 0
        for client in self.clients:
            if all(
                (
                    # "~TAG" requires the tag to be absent, a plain "TAG" requires it to be present.
                    tag[1:] not in client["tags"]
                    if tag.startswith("~") else
                    tag in client["tags"]
                )
                for tag in tags
            ):
                total += 1
                # A boolean is added as 0 or 1.
                dead += not client["alive"]

        return tags, dead, total

    def fill_failed_groups(self, tags):
        """
        Check the group selected by the tag filter and remember it in `self.groups` if it is failed.

        A group is failed when it is not empty, its dead ratio reaches the threshold picked by
        `pick_ratio()` and the amount of dead clients reaches `numeric_threshold`.

        :param tags: tag filter to check
        """
        tags, dead, total = self.count_dead(tags)
        ratio = self.pick_ratio(total, tags)
        if (
            total and
            (dead / float(total) >= ratio) and
            dead >= self.numeric_threshold
        ):
            self.groups.append((tags, dead, total))

    @property
    def error_string(self):
        """
        Human readable description of all failed groups or `None` if there are none.

        The message lists the regular, TOTAL and small group thresholds;
        the OSX specific threshold is not included.
        """
        if not self.groups:
            return None

        def formatter(tags, dead, total):
            # Floor division keeps the percentage a whole number regardless of the
            # interpreter's division semantics (plain `/` is a true division in Python 3).
            return "{}: {}% ({} dead / {} total)".format(
                " & ".join(tags) or "TOTAL", dead * 100 // total, dead, total
            )

        return "More than {}% ({}% for TOTAL, {}% if <{} hosts) of hosts in following groups are dead: {}".format(
            int(self.alert_ratio * 100),
            int(self.alert_ratio_total * 100),
            int(self.alert_ratio_small * 100),
            self.SMALL_GROUP_SIZE,
            "; ".join(it.starmap(formatter, self.groups))
        )


def main():
    """
    Fetch the latest clients snapshot from ClickHouse and report the state of the requested group.

    Command line arguments:
    - 1st (optional): "&"-separated tag filter, "~" prefix negates a tag; empty or missing
      means all clients (TOTAL). A non-empty filter is extended with "~MAINTENANCE".
    - 2nd (optional): name of an `Environment` member, `Environment.PRODUCTION` by default.

    The result is reported with `utils.say()`: WARN if the data cannot be obtained,
    CRIT if the group is failed, OK otherwise.
    """
    # The filter is optional, same as the environment below; no argument means no filter.
    raw_tags = sys.argv[1] if len(sys.argv) > 1 else ""
    tags = tuple(filter(None, raw_tags.split("&")))
    if tags:
        tags += ("~MAINTENANCE",)
    environment = Environment[sys.argv[2]] if len(sys.argv) > 2 else Environment.PRODUCTION

    db_name = DB_NAME if environment is Environment.PRODUCTION else DB_NAME_PRE_PRODUCTION
    query = QUERY_TEMPLATE.format(db_name=db_name)

    data = None
    try:
        data = utils.query_clickhouse(query, environment=environment)
    except (requests.RequestException, httplib.IncompleteRead) as exc:
        utils.say(SERVICE_NAME, utils.STATUS_WARN, "Data source is unreachable: {}".format(exc), die=True)
    except simplejson.decoder.JSONDecodeError as exc:
        # `data` is still `None` here because the call above has not returned, so the raw reply
        # can only be taken from the exception: simplejson stores the parsed document in `doc`.
        utils.say(
            SERVICE_NAME, utils.STATUS_WARN,
            "Clickhouse replied with: {}".format(getattr(exc, "doc", exc)),
            die=True
        )
    if not data or not data["data"]:
        utils.say(SERVICE_NAME, utils.STATUS_WARN, "No data (empty response received from ClickHouse)", die=True)

    clients = data["data"]
    for client in clients:
        # A set makes the per-tag membership tests in `Checker.count_dead()` cheap.
        client["tags"] = set(client["tags"])

    checker = Checker(clients)
    checker.fill_failed_groups(tags)

    error_string = checker.error_string
    if not error_string:
        utils.say(SERVICE_NAME, utils.STATUS_OK, "All good", die=True)
    utils.say(SERVICE_NAME, utils.STATUS_CRIT, error_string)


# Run the check only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
