#!/usr/bin/env python
import traceback
import requests
import argparse
import signal
import json
import sys

# Check result levels, each a (process exit code, label) pair.
# exit_with() prints the label and exits with the code.
OK = (0, 'OK')
WARN = (1, 'WARN')
CRITICAL = (2, 'CRITICAL')
UNKNOWN = (3, 'UNKNOWN')

# Extra diagnostic lines printed after the status line by exit_with().
extras = []
def exit_with(err_lvl, mesg):
    """Print the check result and terminate the process.

    The first output line is "<LABEL>: <mesg>"; the lines collected in the
    module-level ``extras`` list follow it (a blank line when it is empty).

    Args:
        err_lvl: (exit code, label) pair, e.g. one of OK/WARN/CRITICAL/UNKNOWN.
        mesg: human-readable status message.

    Raises:
        SystemExit: always, carrying ``err_lvl[0]`` as the exit code.
    """
    # Single-argument parenthesized print behaves the same on Python 2 and 3.
    print('%s: %s' % (err_lvl[1], mesg))
    print('\n'.join(extras))
    sys.exit(err_lvl[0])

# Response codes (as they appear at the end of the metric names) that are
# counted as errors when computing the error rate.
errors = ["503", "502"]

# Request categories: used both to build the metric regex and to key the
# per-category error/total sums.
types = ["chunk", "playlist", "other"]

# Node description endpoint; {0} is replaced with the node name.
USHER_NODE_URL = 'http://usher.justin.tv/node/show/{0}.json'

def is_node_pr(node):
    """Tell whether ``node`` is reported with rep_type 'hls_replication'.

    Fetches USHER_NODE_URL for the node and inspects the first record of the
    decoded JSON response.

    Args:
        node: node name substituted into USHER_NODE_URL.

    Returns:
        The empty/falsy decoded payload when the response holds no records,
        otherwise True or False depending on the first record's 'rep_type'.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    response = requests.get(USHER_NODE_URL.format(node))
    response.raise_for_status()
    records = json.loads(response.text)
    if not records:
        # Nothing known about the node: hand back the falsy payload itself.
        return records
    return records[0].get('rep_type') == 'hls_replication'

def main():
    """Check the nginx error rate of one host and exit with the result.

    For every request category in ``types`` the per-code request series
    returned by ganglia are averaged over the last ``--mins`` minutes; the
    share of requests whose code is listed in ``errors`` is then compared,
    as a percentage, against ``--warn`` and ``--crit``.

    The function never returns normally: every path ends in exit_with(),
    which exits with OK, WARN, CRITICAL or UNKNOWN. Unexpected exceptions
    and the ``--timeout`` alarm both produce UNKNOWN.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--warn', type=int, default=5, help='warn level')
    parser.add_argument('-c', '--crit', type=int, default=10, help='crit level')
    parser.add_argument('-m', '--mins', type=int, default=1,  help='mins to average')
    parser.add_argument('--timeout', type=int, default=15, help='timelimit for the check')
    parser.add_argument('-H', '--host', required=True, help='hostname')
    args = parser.parse_args()

    def _on_timeout(signum, frame):
        exit_with(UNKNOWN, "timed out after %d seconds" % args.timeout)

    # Bound the whole check (including the HTTP requests) with SIGALRM.
    signal.signal(signal.SIGALRM, _on_timeout)
    signal.alarm(args.timeout)

    try:
        if is_node_pr(args.host):
            exit_with(OK, "Node {0} is a PR, ignoring.".format(args.host))

        metric = 'nginx_({0})_request_(.*)'.format('|'.join(types))

        window = args.mins * 60
        # request some more data because it might be padded with NaN's
        metrics = get_metrics(args.host, metric, window + 120)
        if not metrics:
            exit_with(UNKNOWN, "Ganglia did not return any data.")

        series = extract_series(metrics, window)
        if not series:
            exit_with(UNKNOWN, "Not enough data returned from ganglia.")

        # Per category: [average error requests, average total requests].
        data = dict((t, [0.0, 0.0]) for t in types)
        for name, values in series.items():
            if not values:
                # A series with no sample inside the window cannot be
                # averaged; report it instead of dividing by zero.
                exit_with(UNKNOWN, "Not enough data returned from ganglia.")
            # NOTE(review): assumes the series name is three
            # whitespace-separated tokens ending in category and code -
            # confirm against the ganglia response.
            _, t, code = name.split()
            avg = float(sum(values)) / len(values)
            if code in errors:
                data[t][0] += avg
            data[t][1] += avg

        crit, warn = False, False
        message = []
        for t in types:
            error, total = data.get(t, (0, 0))
            # Thresholds and the printed value are percentages, so scale the
            # error/total ratio accordingly.
            rate = 100.0 * error / total if total else 0.0

            state = ''
            if rate >= args.crit:
                crit = True
                state = ' is CRITICAL'
            elif rate >= args.warn:
                warn = True
                state = ' is WARNING'

            message.append('{0}{1}: {2:.2f}% ({3:.0f} of {4:.2f});'.format(t, state, rate, error, total))

        message = ' '.join(message)
        if crit:
            exit_with(CRITICAL, message)
        elif warn:
            exit_with(WARN, message)
        else:
            exit_with(OK, message)

    except Exception:
        # SystemExit raised by exit_with() is not an Exception subclass, so
        # regular exits pass through; only genuine failures land here.
        exit_with(UNKNOWN, traceback.format_exc())

def _is_number(value):
    """Return True for a real int/float sample, False for NaN or padding."""
    # NaN is the only float that is not equal to itself.
    return isinstance(value, (float, int)) and value == value


def extract_series(metrics, window):
    """Cut every series down to a common, fully populated time window.

    Args:
        metrics: iterable of dicts with a 'metric_name' string and a
            'datapoints' list of (value, timestamp) pairs. Non-numeric values
            and NaN floats are treated as missing samples.
        window: window length, in the same unit as the timestamps.

    Returns:
        A dict mapping the stripped metric name to the list of values whose
        timestamp lies in ``(last_full - window, last_full]``, where
        ``last_full`` is the latest timestamp for which every series already
        has data. Returns None when ``metrics`` is empty or when any series
        contains no usable sample at all.
    """
    samples = {}
    for series in metrics:
        name = str(series['metric_name'].strip())
        points = [(time, value) for (value, time) in series['datapoints']
                  if _is_number(value)]
        if not points:
            # Only NaNs were returned, we can't work under such conditions!
            return None
        samples[name] = points

    if not samples:
        return None

    # the smallest timestamp of all the last timestamps having the data,
    # the comparison relies on tuples having a timestamp as their first element
    last_full = min(map(max, samples.values()))[0]

    # keep the data that is within the window ending at the last full data point
    first_excluded = last_full - window
    return dict(
        (name, [value for (time, value) in points
                if first_excluded < time <= last_full])
        for name, points in samples.items()
    )

def get_metrics(host, metric, window):
    """Fetch aggregated metric data for ``host`` from the ganglia API.

    The request URL is also appended to the module-level ``extras`` list so
    that it is shown in the check output.

    Args:
        host: value for the ``hreg[]`` query parameter.
        metric: value for the ``mreg[]`` query parameter.
        window: how far back to query; sent as ``cs=-<window>``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if ganglia answers with an error status.
    """
    # ``extras`` is only mutated, never rebound, so no ``global`` is needed.
    query_url = "https://ganglia-api.internal.justin.tv/graph.php?gtype=stacked&aggregate=1&hreg[]={host}&mreg[]={metric}&cs=-{window}&json=1".format(
        host=host, metric=metric, window=window)
    extras.append("url used: {0}".format(query_url))

    response = requests.get(query_url)
    response.raise_for_status()
    return json.loads(response.text)

# Run the check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
