#!/usr/bin/env python
# Modified version of "porto_cpu_limits" for working in isolation mode
#
# Provides: porto_cpu_limits
#
# $Id: porto_cpu_limits.py 11257 2017-07-19 15:26:40Z mixas $
# $HeadURL: svn+ssh://svn.yandex.ru/monitor/trunk/yandex-farm/projects/monitoring/monitoring-only/search/home/monitor/agents/modules-available/porto_cpu_limits.py $
#
# some info available in https://st.yandex-team.ru/SEPE-16522

# DISCLAIMER:
#   all output is either in nanoseconds or a share of a metric/limit
#   summaries are totals of the filtered containers' metrics vs the whole CPU
#   don't expect precise accuracy here - it's just a monitoring tool after all

import argparse
import multiprocessing
import os
import porto
import sys
import time


# Check name placed into every PASSIVE-CHECK result line.
CHECK_NAME = 'porto_cpu_limits'
# CPU count as a float, so arithmetic that uses it is done in floating point.
CPU_COUNT = multiprocessing.cpu_count() * 1.0


def die(check_name, status, message):
    """Print one passive-check result line and terminate the process.

    The line has the form ``PASSIVE-CHECK:<check_name>;<status>;<message>``
    and is written to stdout followed by a newline.

    :param check_name: name of the check to report
    :param status: numeric status (this script uses 0 = ok, 1 = warn, 2 = crit)
    :param message: free-form description of the result
    :raises SystemExit: always; the exit code is 0 regardless of ``status``,
        the status travels only inside the printed line
    """
    # sys.stdout.write instead of the ``print`` statement: emits exactly the
    # same bytes, but parses on both Python 2 and Python 3.
    sys.stdout.write('PASSIVE-CHECK:%s;%s;%s\n' % (check_name, status, message))
    sys.exit(0)


def nanos(value):
    """Convert a CPU guarantee/limit property string to a number.

    :param value: property value as a string; a trailing ``c`` means the
        number is expressed in cores (e.g. ``'1.5c'``)
    :returns: float; for core-suffixed values the number of CPU nanoseconds
        available per second (cores * 1e9), otherwise the value itself
        converted to float
    :raises ValueError: if the numeric part cannot be parsed
    """
    if value.endswith('c'):  # cores
        return 1000000000 * float(value[:-1])
    # Always hand back a number: the caller multiplies the result by the
    # probe interval, which fails with TypeError on a raw string.
    return float(value)


def get_porto_containers_metrics(container_name):
    """Fetch the CPU-related properties of a container from porto.

    :param container_name: container name passed to ``porto`` ``Get``
    :returns: whatever ``Connection.Get`` returns; the caller treats it as a
        mapping ``{container: {property: value}}``
    :raises SystemExit: via ``die()`` with a CRIT status if the connection or
        the request fails for any reason
    """
    values = [
        'cpu_guarantee',
        'cpu_limit',
        'cpu_usage',
        'cpu_usage_system',
        'cpu_wait',  # kern 4.4 required: https://wiki.yandex-team.ru/porto/major3/#pribory
        'state',
    ]

    metrics = {}
    # Pre-initialise so that ``finally`` is safe even when Connection() itself
    # raises; otherwise the unbound name would replace die()'s SystemExit
    # with a NameError traceback.
    api = None

    try:
        api = porto.Connection()
        metrics = api.Get([container_name], values)
    except Exception:
        die(CHECK_NAME, 2, "Failed to get container metrics")
    finally:
        if api is not None:
            api.disconnect()

    return metrics


def parse_args():
    """Parse command-line options of the check.

    The default for ``--container-name`` is derived from the environment:
    ``'self'`` when ``NANNY_SERVICE_ID`` is unset or empty, otherwise a name
    built from ``BSCONFIG_IPORT``, ``NANNY_SERVICE_ID`` and the part of
    ``HOME`` after its last underscore.

    :returns: ``argparse.Namespace`` with ``container_name``, ``debug``,
        ``cpu_usage_crit``, ``cpu_wait_crit`` and ``probe_interval``
    """
    nanny_service_id = os.getenv('NANNY_SERVICE_ID')
    if nanny_service_id:
        default_container = 'ISS-AGENT--{port}/{port}_{service}_{hash}/iss_hook_start'.format(
            port=os.getenv('BSCONFIG_IPORT', ''),
            service=nanny_service_id,
            # HOME may be absent from the environment; fall back to an empty
            # string instead of crashing on None.split().
            hash=os.getenv('HOME', '').split('_')[-1],
        )
    else:
        default_container = 'self'

    parser = argparse.ArgumentParser(
        description='Juggler check for porto CPU limits, SEPE-16522',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('--container-name', type=str, help='container names regexp',
                        default=default_container, metavar='REGEXP')
    parser.add_argument('--debug', help='print debug info', action="store_true")
    parser.add_argument('--cpu-usage-crit', type=float, help='cpu_usage/guarantee crit threshold', default=0.75)
    parser.add_argument('--cpu-wait-crit', type=float, help='cpu_wait/guarantee crit threshold', default=0.01)
    parser.add_argument('--probe-interval', type=float, help='seconds between probes', default=5, metavar='SECONDS')

    return parser.parse_args()


if __name__ == '__main__':
    opts = parse_args()

    probe1 = get_porto_containers_metrics(opts.container_name)
    time.sleep(opts.probe_interval)
    probe2 = get_porto_containers_metrics(opts.container_name)

    if not probe1 or not probe2:
        die(CHECK_NAME, 2, "No one container filtered by '{}'".format(opts.container_name))

    check = {'CRIT': [], 'INFO': [], 'WARN': []}

    total_cpu_bandwidth = opts.probe_interval * 1000000000 * CPU_COUNT  # bandwidth in nanoseconds
    total_cpu_usage = 0
    total_cpu_usage_system = 0

    for container, metrics in probe2.items():
        if container not in probe1:
            continue

        p1 = probe1[container]
        p2 = probe2[container]

        cpu_guarantee = nanos(p2['cpu_guarantee']) * opts.probe_interval
        cpu_limit = nanos(p2['cpu_limit']) * opts.probe_interval

        cpu_usage = int(p2['cpu_usage']) - int(p1['cpu_usage'])
        cpu_usage_system = int(p2['cpu_usage_system']) - int(p1['cpu_usage_system'])
        cpu_wait = int(p2['cpu_wait']) - int(p1['cpu_wait'])

        if opts.debug:
            print "{}:\n\tcpu_usage: {}\n\tcpu_wait: {}\n\tcpu_guarantee: {}\n\tcpu_limit: {}\n".format(
                container, cpu_usage, cpu_wait, int(cpu_guarantee), int(cpu_limit))

        total_cpu_usage += cpu_usage
        total_cpu_usage_system += cpu_usage_system

        if cpu_guarantee > 0:
            if cpu_usage > cpu_guarantee * opts.cpu_usage_crit:
                check['CRIT'].append("{} cpu_usage/guarantee {}".format(
                    container, round(cpu_usage / cpu_guarantee, 2)))

            if cpu_wait > cpu_guarantee * opts.cpu_wait_crit:
                check['CRIT'].append("{} cpu_wait/guarantee {}".format(
                    container, round(cpu_wait / cpu_guarantee, 2)))

    # report results
    msg = "Ok"
    code = 0
    stats = "US: {}%, SY:{}%, CPUs: {}".format(
        round(total_cpu_usage / total_cpu_bandwidth, 3) * 100,
        round(total_cpu_usage_system / total_cpu_bandwidth, 3) * 100,
        CPU_COUNT
    )

    if len(check['WARN']):
        msg = "WARNS: {}".format(", ".join(check['WARN']))
        code = 1

    if len(check['CRIT']):
        msg = "CRITS: {}".format(", ".join(check['CRIT']))
        msg += ("; " + msg if code else "")
        code = 2

    die(CHECK_NAME, code, msg + "; " + stats)
