#!/usr/bin/env python
import sys
from sys import exit
import argparse
import requests
import json
import time
import collections
import re, sre_constants

# Exit statuses reported by this check, in increasing numeric order:
# success, warning, critical, unknown. main() uses them as the process
# exit code.
OK, WARN, CRIT, UNKN = 0, 1, 2, 3

def _sum_datapoints(data, exclude_patterns, verbose=False):
    """Sum the per-host series in *data* into one cluster-wide series.

    data             -- iterable of dicts with 'metric_name' and
                        'datapoints' (a list of [value, timestamp] pairs).
    exclude_patterns -- compiled regexes; a series whose name matches any
                        of them (anchored at the start) is left out.
    verbose          -- print a line for every skipped series.

    Returns (number_of_hosts, totals): the count of series that were kept
    and a dict mapping timestamp -> summed value.
    """
    number_of_hosts = 0
    totals = collections.defaultdict(float)

    # Ganglia is kinda stupid in that the aggregate graphs return multiple
    # hosts as "metrics" to shoehorn multiple server graphs into the
    # standard RRD system, so every dataset here is really one host.
    for dataset in data:
        name = dataset['metric_name']

        matched = None
        for pattern in exclude_patterns:
            if pattern.match(name.rstrip()):
                matched = pattern
                break  # one matching exclusion is enough
        if matched is not None:
            if verbose:
                print("Skipping %s due to '%s'" % (name, matched.pattern))
            continue

        number_of_hosts += 1
        for value, data_time in dataset['datapoints']:
            # Missing samples may arrive as the string "NaN", a float NaN
            # or null; none of them may contribute to the total.
            try:
                sample = float(value)
            except (TypeError, ValueError):
                continue
            if sample != sample:  # NaN is the only value unequal to itself
                continue
            totals[data_time] += sample

    return number_of_hosts, totals


def _recent_average(totals, mins):
    """Average the non-zero totals in the last *mins* minutes of data.

    The window ends at the newest timestamp whose total is non-zero (or at
    the current time when there is none) and covers the preceding
    60 * mins seconds. Returns 0.0 when the window holds no samples.
    """
    last_timestamp = int(time.time())
    for data_time in sorted(totals, reverse=True):
        if totals[data_time] != 0:
            last_timestamp = data_time
            break

    window_start = last_timestamp - 60 * mins
    recent = [
        total for data_time, total in totals.items()
        if window_start < data_time <= last_timestamp and total != 0
    ]
    if not recent:
        return 0.0
    # float() keeps this a true division on Python 2 as well.
    return sum(recent) / float(len(recent))


def main():
    """Check a cluster's aggregate throughput against warn/crit thresholds.

    Parses the command line, fetches the last hour of the chosen metric
    for every host matching --cluster, averages the summed series over the
    last --mins minutes and compares the result (converted to mbps) with
    --warn / --crit percent of capacity * number_of_hosts.

    Prints a one-line status and returns the matching status code (OK,
    WARN, CRIT or UNKN) on every path, so that sys.exit(main()) sets the
    process exit code.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--warn', dest='warn', type=int, default=90, help='warn level')
    parser.add_argument('-c', '--crit', dest='crit', type=int, default=95, help='crit level')
    parser.add_argument('-m', '--mins', dest='mins', type=int, default=5, help='mins to average')
    parser.add_argument('-n', '--capacity', dest='capacity', type=int, default=1000, help='capacity per machine')
    parser.add_argument('-C', '--cluster', dest='cluster', help='cluster name', required=True)
    parser.add_argument('-M', '--metric', dest='metric', help='metric to check', default="bytes_out")
    parser.add_argument('-x', '--exclude', dest='exclude', default=[], action='append')
    parser.add_argument('-v', '--verbose', dest='verbose', default=False, action='store_true')
    parser.add_argument('-t', '--timeout', dest='timeout', type=float, default=10.0,
                        help='HTTP timeout in seconds')
    args = parser.parse_args()

    # Validate the exclusion regexes up front so that a bad argument is
    # reported before any network traffic happens.
    exclude_patterns = []
    for expression in args.exclude:
        try:
            exclude_patterns.append(re.compile(expression))
        except re.error as e:
            print("Invalid regex: '%s'; Error: %s" % (expression, e))
            return CRIT

    try:
        # Passing the query as params lets requests URL-encode the cluster
        # and metric regexes; the timeout keeps the check from hanging.
        # NOTE(review): json=0 is what this check has always sent even
        # though the body is parsed as JSON below -- confirm against the
        # Ganglia deployment.
        r = requests.get(
            "https://ganglia.internal.justin.tv/graph.php",
            params=[
                ('r', 'hour'),
                ('gtype', 'stacked'),
                ('aggregate', '1'),
                ('json', '0'),
                ('hreg[]', args.cluster),
                ('mreg[]', args.metric),
            ],
            timeout=args.timeout
        )
    except Exception as e:
        print("UNKN: Exception '%s'" % e)
        return UNKN

    if not r.ok:
        sys.stderr.write("Error [%d]: %s\n" % (r.status_code, r.reason))
        return UNKN

    try:
        data = json.loads(r.text)
    except ValueError as e:
        # A non-JSON body would otherwise surface as a traceback.
        print("UNKN: Can't parse response for cluster '%s': %s" % (args.cluster, e))
        return UNKN

    if not data:
        print("Error: Can't get data for cluster '%s'." % args.cluster)
        return UNKN

    number_of_hosts, totals = _sum_datapoints(data, exclude_patterns, args.verbose)
    if number_of_hosts == 0:
        # This path used to fall through and exit with status 0.
        print("CRIT: no ingest servers found")
        return CRIT

    average = _recent_average(totals, args.mins)

    max_level = args.capacity * number_of_hosts
    crit_level = (args.crit / 100.0) * max_level
    warn_level = (args.warn / 100.0) * max_level
    mbps = (average / 1000.0 / 1000.0) * 8
    # A capacity of 0 must not turn the report into a ZeroDivisionError.
    percent = (mbps / max_level) * 100 if max_level else 0.0

    if mbps < 0:
        label, status = "UNKN", UNKN
    elif mbps > crit_level:
        label, status = "CRIT", CRIT
    elif mbps > warn_level:
        label, status = "WARN", WARN
    else:
        label, status = "OK", OK

    report = "%s: %0.2f mbps (%.2f%% / %d hosts) -- %s"
    thresholds = "(warn: %.0f%% / crit: %.0f%%)" % (args.warn, args.crit)
    print(report % (label, mbps, percent, number_of_hosts, thresholds))
    return status

if __name__ == "__main__":
    # Run the check and hand whatever main() returns to sys.exit() as the
    # process exit status.
    status = main()
    sys.exit(status)
