#!/usr/bin/env python

"""
Nagios check for telegraph RabbitMQ. Makes sure that each
telegraph_* queue has a corresponding telegraph_*_overflow queue.
Consider items that went through the telegraph_*_overflow as loss.
We WARN/CRIT for aggregate loss over particular thresholds, and UNKNOWN
for queues where we don't have enough accumulated data to decide.
"""

import pynagios as nagios
import requests
import collections
import time
import json
import csv

@nagios.parse_args
def parse_args(parser):
    """
    Register this check's command-line options on ``parser``.

    ``--warning-rate`` and ``--critical-rate`` are still accepted but their values are
    stored under throwaway destinations and never read by the rest of this file.
    """
    add = parser.add_argument

    # Which broker / which queues
    add('--server', dest='server', type=str, required=True,
        help='Which RabbitMQ to monitor')
    add('--queues', dest='queues', nargs='+', default=None,
        help='The queues to check. If not given, all telegraph_* queues will be checked. Other queue data will NOT be stored between invocations.')

    # Sampling windows, all in seconds
    add('--min-sample-period', dest='min_period', type=int, default=100,
        help='Minimum window (in seconds) to consider')
    add('--max-sample-period', dest='max_period', type=int, default=600,
        help='Maximum window (in seconds) to consider')
    add('--discard-sample-period', dest='discard_period', type=int, default=60*120,
        help='Time (in seconds) after which to discard samples')

    # Legacy options, kept so existing nagios configuration keeps parsing
    add('--warning-rate', dest='ignore1', type=float, required=False, default=0.01,
        help='IGNORED (can be removed when nagios reconfig is complete)')
    add('--critical-rate', dest='ignore2', type=float, required=False, default=0.01,
        help='IGNORED (can be removed when nagios reconfig is complete)')

    # Alert thresholds on the aggregate loss ratio
    add('--warning-ratio', dest='warning_ratio', type=float, required=False, default=0.001,
        help='Loss ratio (aggregate across all queues) to trigger WARNING state')
    add('--critical-ratio', dest='critical_ratio', type=float, required=False, default=0.070,
        help='Loss ratio (aggregate across all queues) to trigger CRITICAL state')

    # Where samples are persisted between runs
    add('--sample-file', dest='sample_file', type=str, required=False, default='/tmp/nagios_telegraph_rate.csv',
        help='File to store sample data in between invocations.')


def nagios_error_status(*args):
    """
    Build the result returned when the monitor itself fails.

    The string arguments are concatenated (no separator) into a single message and
    returned together with the UNKNOWN status as a ``(status, details)`` pair.
    """
    message = ''.join(args)
    details = {'monitor error': message}
    return nagios.UNKNOWN, details

def queue_detail_text(loss_data):
    """
    Render per-queue loss entries as text, one line per queue.

    ``loss_data`` is any iterable of dicts carrying the keys ``queue``, ``interval``,
    ``total_messages``, ``total_rate``, ``lost_messages`` and ``loss_rate``. The input is
    not modified. Lines are ordered by ``loss_rate`` descending, ties broken by
    ``total_rate`` descending; entries whose rate is None (not enough data) come last.
    Returns an empty string when there are no entries.
    """
    def none_last(value):
        # Pair each rate with a "has a value" flag so None never gets compared against a
        # number. Under the descending sort this places None after every real rate, which
        # is the ordering Python 2 gave implicitly, and avoids a TypeError on Python 3.
        return (value is not None, value)

    # One stable sort on a compound key is equivalent to sorting by the secondary key
    # and then by the primary key.
    ordered = sorted(
        loss_data,
        key=lambda d: (none_last(d['loss_rate']), none_last(d['total_rate'])),
        reverse=True)

    line_format = '{queue}: interval={interval}, total={total_messages}, total_per_second={total_rate}, loss={lost_messages}, loss_per_second={loss_rate},'
    return '\n'.join(line_format.format(**d) for d in ordered)


# Keyword arguments shared by the csv.DictWriter in save_samples and the csv.DictReader
# in load_samples, so the sample file is written and read with the same column layout.
CSV_ARGS = dict(
    fieldnames=['ts', 'ts_human', 'queue', 'ingress', 'egress', 'total_counter', 'expired_counter'],
    quoting=csv.QUOTE_NONNUMERIC,
    strict=True,
)

def save_samples(config, samples):
    """
    Overwrite the sample file with the samples that are still worth keeping.

    Samples more than ``config.discard_period`` seconds older than the last sample in the
    list are dropped. No header row is written; the column layout comes from CSV_ARGS.
    """
    # The last element is taken as the reference "now"; with no samples keep everything.
    cutoff = (samples[-1]['ts'] - config.discard_period) if samples else 0

    with open(config.sample_file, 'wb') as out:
        retained = (sample for sample in samples if sample['ts'] >= cutoff)
        csv.DictWriter(out, **CSV_ARGS).writerows(retained)

def load_samples(config):
    """
    Read previously stored samples from ``config.sample_file``.

    Returns a list with one dict per stored row. An IOError (for example when the file
    does not exist yet) is swallowed and yields an empty list; malformed content may
    raise an exception from the csv module.
    """
    try:
        with open(config.sample_file, 'rb') as source:
            return list(csv.DictReader(source, **CSV_ARGS))
    except IOError:
        return []

def get_current_rabbitmq_samples(config):
    """
    Fetch the queue list from ``http://<config.server>/api/queues`` and return one sample
    dict per monitored queue, all stamped with the same timestamp.

    ``*_overflow`` queues never get a sample of their own; their ``next_seq_id`` is
    recorded as ``expired_counter`` on the sample of the queue they belong to (None when
    no overflow queue exists). When ``config.queues`` is None every ``telegraph_*`` queue
    is monitored, otherwise only the queues named there.

    HTTP errors are raised by ``raise_for_status``.
    """
    now = time.time()
    now_human = time.ctime(now)

    url = 'http://%s/api/queues' % config.server
    response = requests.get(url, auth=('guest', 'guest'))
    response.raise_for_status()

    by_name = {}
    for entry in json.loads(response.text):
        by_name[entry['name']] = entry

    collected = []
    for name, info in by_name.iteritems():
        if name.endswith('_overflow'):
            continue

        if config.queues is None:
            wanted = name.startswith('telegraph_')
        else:
            wanted = config.queues and name in config.queues
        if not wanted:
            continue

        overflow = by_name.get('%s_overflow' % (name))
        total_counter = info['backing_queue_status']['next_seq_id']
        if overflow:
            expired_counter = overflow['backing_queue_status']['next_seq_id']
        else:
            expired_counter = None

        collected.append({
            'queue': str(name),
            'ts': now,
            'ts_human': now_human,
            'total_counter': total_counter,
            'expired_counter': expired_counter})

    return collected


def _counter_delta(newer, older):
    """Return ``newer - older``, or None when either counter is missing or non-numeric."""
    try:
        return newer - older
    except TypeError:
        # e.g. None (queue has no overflow queue) or '' (such a sample reloaded from CSV)
        return None


def _counter_reset(previous, current):
    """True when ``current`` cannot be validly compared with ``previous``: the counter
    went backwards, or one of the two values is missing."""
    delta = _counter_delta(current, previous)
    return delta is None or delta < 0


def _blank_loss_entry(queue_name):
    """Loss entry for a queue we cannot (yet) say anything about."""
    return {'queue': queue_name, 'total_messages': None, 'lost_messages': None,
            'interval': None, 'total_rate': None, 'loss_rate': None}


def calculate_loss_data(config, samples):
    """
    Given a bunch of samples, which must be non-empty and stored oldest-to-newest, return a
    list of loss-information dicts, one for each queue which has the most recent sampling
    timestamp (plus one for each explicitly requested queue that is absent from the data).

    Each dict has the keys ``queue``, ``total_messages``, ``lost_messages``, ``interval``,
    ``total_rate`` and ``loss_rate``. The rates are None for queues where we don't have
    enough data to calculate a loss rate: the usable window is shorter than
    ``config.min_period``, or a counter is missing (for instance a queue without an
    overflow queue), in which case the message counts and interval are None as well.

    Samples older than ``config.max_period`` seconds before the newest sample are ignored.
    """
    ts_now = samples[-1]['ts']
    window_start = ts_now - config.max_period

    newest_sample = {}
    older_sample = {}
    for s in samples:
        if s['ts'] < window_start:
            continue

        qn = s['queue']
        previous = newest_sample.get(qn)

        # Does the "old newest" sample have a greater counter than the "new newest" sample?
        # This indicates the queue was destroyed then recreated, resetting the counter. We
        # can't validly compare across that event, nor across a sample whose counter is
        # missing, so the window for this queue restarts at the current sample.
        if previous is not None and (
                _counter_reset(previous['expired_counter'], s['expired_counter'])
                or _counter_reset(previous['total_counter'], s['total_counter'])):
            del older_sample[qn]
            del newest_sample[qn]

        newest_sample[qn] = s
        older_sample.setdefault(qn, s)

    # Where we have a current sample, and an older sample in the permissible time window,
    # determine the loss rate over that time
    loss_data = []
    for qn, newest in newest_sample.items():
        if newest['ts'] != ts_now:
            continue

        d = _blank_loss_entry(qn)
        older = older_sample.get(qn)
        if older:
            total = _counter_delta(newest['total_counter'], older['total_counter'])
            lost = _counter_delta(newest['expired_counter'], older['expired_counter'])

            # A missing counter leaves the entry blank ("not enough data") instead of
            # raising TypeError and taking the whole check down with it.
            if total is not None and lost is not None:
                d['total_messages'] = int(total)
                d['lost_messages'] = int(lost)
                d['interval'] = float(newest['ts'] - older['ts'])

                if d['interval'] and d['interval'] >= config.min_period:
                    d['total_rate'] = float(d['total_messages']) / d['interval']
                    d['loss_rate'] = float(d['lost_messages']) / d['interval']

        loss_data.append(d)

    # Represent queues which were explicitly asked for, but not present in the data
    if config.queues:
        reported = set(d['queue'] for d in loss_data)
        for qn in sorted(set(config.queues) - reported):
            loss_data.append(_blank_loss_entry(qn))

    return loss_data


def aggregate_loss_data(loss_data):
    """
    Sum the per-queue rates and derive the overall loss ratio.

    Entries whose rate is missing or None contribute nothing. The ratio is reported as
    0.0 unless the summed total rate is at least 1 message per second.

    Returns a ``(lost, total, ratio)`` tuple.
    """
    def column_sum(field):
        return sum([(entry.get(field) or 0.0) for entry in loss_data])

    total = column_sum('total_rate')
    lost = column_sum('loss_rate')

    if total >= 1:
        ratio = lost / total
    else:
        ratio = 0.0

    return lost, total, ratio


@nagios.check
def check(config):
    """
    Run one iteration of the check.

    Loads the stored samples, appends a fresh sample per monitored queue, saves the
    result, then derives per-queue and aggregate loss figures.

    Returns ``(status, counts, extra_details)``:
      * status is CRITICAL / WARNING when the aggregate loss ratio reaches the respective
        configured ratio, otherwise UNKNOWN if any queue lacks enough data, otherwise OK;
      * counts is an ordered mapping of summary figures;
      * extra_details is the per-queue text for queues with loss or without enough data
        (for all queues when ``config.verbose`` is set).

    If loading, fetching or saving fails, the ``(UNKNOWN, details)`` pair built by
    nagios_error_status is returned instead.
    """
    # load, update, and save our sampled rabbitmq data. return UNKNOWN status if things look bad
    try:
        samples = load_samples(config)
    except Exception:
        return nagios_error_status('State file is present but could not load, it may be corrupt: ', config.sample_file)

    try:
        current_samples = get_current_rabbitmq_samples(config)
    except Exception as e:
        return nagios_error_status('Unable to fetch rabbitmq stats over HTTP: ', str(e))

    if not current_samples:
        return nagios_error_status('We got data from the rabbitmq API, but it did not contain any queues of interest')
    samples += current_samples

    try:
        save_samples(config, samples)
    except Exception:
        return nagios_error_status('Unable to save state file: ', config.sample_file)

    # calculate and output loss rates
    loss_data = calculate_loss_data(config, samples)
    lost, total, loss_ratio = aggregate_loss_data(loss_data)

    # A rate of None means "not enough data"; test for it explicitly rather than relying
    # on how None happens to order against a float.
    unknown = [d['queue'] for d in loss_data if d['loss_rate'] is None]
    unhealthy = [d['queue'] for d in loss_data if d['loss_rate'] is not None and d['loss_rate'] > 0.0]
    healthy = [d['queue'] for d in loss_data if d['loss_rate'] == 0.0]

    # Loss thresholds take precedence over "not enough data".
    if loss_ratio >= config.critical_ratio:
        status = nagios.CRITICAL
    elif loss_ratio >= config.warning_ratio:
        status = nagios.WARNING
    elif unknown:
        status = nagios.UNKNOWN
    else:
        status = nagios.OK

    counts = collections.OrderedDict()
    counts['not enough data'] = '%s queues' % len(unknown)
    counts['some loss']       = '%s queues' % len(unhealthy)
    counts['zero loss']       = '%s queues' % len(healthy)
    counts['lost/total msg per second'] = '%s / %s == %s' % (round(lost,2), round(total,2), round(loss_ratio,4))

    # Build the lookup once instead of concatenating the two lists for every queue.
    # NOTE(review): config.verbose is not defined by parse_args in this file; presumably
    # pynagios supplies it - confirm.
    noteworthy = set(unknown) | set(unhealthy)
    extra_details = queue_detail_text(d for d in loss_data if (config.verbose or d['queue'] in noteworthy))

    return status, counts, extra_details


# Script entry point: only when executed directly (not imported), hand control to pynagios.
# NOTE(review): run_check presumably parses the arguments registered via @nagios.parse_args
# and invokes the function registered via @nagios.check - confirm against pynagios.
if __name__ == '__main__':
    nagios.run_check()

