#!/usr/bin/env python
from __future__ import with_statement
import sys
import os
from sys import exit
import argparse
import requests
import time
import re
from socket import getfqdn

try:
    import json
except:
    import simplejson as json

class NagiosStatus():
    """A named plugin state paired with the process exit code it maps to."""

    def __init__(self, name, value):
        # name:  short label printed in the plugin output (e.g. "OK")
        # value: integer handed to sys.exit() and used to rank severity
        self.name, self.value = name, value

    def __repr__(self):
        return "<NagiosStatus: %s>" % self.name

    def __str__(self):
        return self.name

    def exit(self):
        """Terminate the process using this status' value as the exit code."""
        sys.exit(self.value)

    def max(self, alert):
        """Return whichever of ``self`` and ``alert`` has the larger value.

        When both values are equal, ``alert`` is returned.
        """
        return self if self.value > alert.value else alert

# Module-level singletons for the four plugin states. The second argument is
# the exit code NagiosStatus.exit() passes to sys.exit(); it also ranks the
# states when two of them are compared with NagiosStatus.max().
OK   = NagiosStatus("OK", 0)
WARN = NagiosStatus("WARN", 1)
CRIT = NagiosStatus("CRIT", 2)
UNKN = NagiosStatus("UNKN", 3)

def get_nic_info(verbose=False, prefix="/sys/class/net/"):
    """Count the ``eth*`` interfaces that are up and total their link speeds.

    Args:
        verbose: when true, print one line for every ``eth*`` interface that
            is skipped because it is not up or cannot be read.
        prefix: directory containing one sub-directory per network interface,
            each holding an ``operstate`` file and optionally a ``speed``
            file. Defaults to the Linux sysfs location.

    Returns:
        A ``(count, max_speed)`` tuple: the number of ``eth*`` interfaces
        whose operstate starts with "up", and the sum of their speeds.

    Raises:
        OSError: if ``prefix`` itself cannot be listed.
    """
    # Speed assumed when the speed file is missing or holds no usable value.
    # A missing file is a known problem with older versions of ubuntu.
    default_speed = 1000
    count = 0
    max_speed = 0

    for interface in os.listdir(prefix):
        if not interface.startswith("eth"):
            continue

        device_dir = os.path.join(prefix, interface)
        speed_file = os.path.join(device_dir, "speed")
        try:
            with open(os.path.join(device_dir, "operstate")) as f:
                operstate = f.read()

            if not operstate.startswith("up"):
                if verbose:
                    print("%s: Offline Device" % interface)
                continue

            speed = default_speed
            if os.path.isfile(speed_file):
                with open(speed_file) as f:
                    raw_speed = f.readline()
                try:
                    speed = int(raw_speed)
                except ValueError:
                    # Empty or garbled speed file: treat like a missing one.
                    speed = default_speed
                if speed <= 0:
                    # Some drivers report -1 for "unknown"; adding that would
                    # shrink the total instead of growing it.
                    speed = default_speed
        except IOError:
            # Unreadable operstate/speed file: the device is not usable.
            if verbose:
                print("%s: Offline Device" % interface)
            continue

        max_speed += speed
        count += 1

    return (count, max_speed)

def get_ganglia_data(host, verbose=False):
    """Fetch the aggregated ``bytes_*`` graph data for ``host`` from Ganglia.

    Args:
        host: hostname inserted into the host regex of the API query.
        verbose: when true, print the request URL before sending it.

    Returns:
        The decoded JSON body of the response.

    Any failure (request exception, non-OK HTTP status, undecodable body)
    prints a one-line "UNKN: ..." message on stdout and terminates the
    process through ``UNKN.exit()``; the function never returns in that case.
    """
    url = "https://ganglia-api.internal.justin.tv/graph.php?r=hour&hreg[]=^%s&mreg[]=^bytes_.*&aggregate=1&json=1" % (host)
    if verbose:
        print("Ganglia API Request: %s" % url)

    try:
        r = requests.get(
                url,
                timeout=4,
                )
    except Exception as e:
        # TODO: This shouldn't be here?
        print("UNKN: Exception '%s'" % e)
        UNKN.exit()

    if not r.ok:
        # Report on stdout like the other failure paths: Nagios shows the
        # plugin's stdout, so a message written to stderr would be lost.
        print("UNKN: Error [%d]: %s" % (r.status_code, r.reason))
        UNKN.exit()

    try:
        return json.loads(r.text)
    except ValueError as e:
        # A 2xx response whose body is not JSON would otherwise surface as a
        # traceback rather than a plugin status.
        print("UNKN: Invalid JSON in Ganglia response: %s" % e)
        UNKN.exit()

def filter_valid_datapoints(datapoints, earliest, latest):
    """Keep the datapoints whose timestamp lies strictly inside a window.

    Each datapoint is a two-item ``[value, timestamp]`` pair. A pair is kept
    only when ``earliest < timestamp < latest``; both bounds are exclusive.
    The surviving pairs are returned in their original order as a new list.
    """
    def inside_window(point):
        _value, stamp = point
        return earliest < stamp < latest

    return [point for point in datapoints if inside_window(point)]
        

def process_data(data, mins):
    """Average each metric over the ``mins`` minutes before its last non-zero sample.

    Args:
        data: iterable of datasets, each a mapping with a ``"datapoints"``
            list of ``[value, unix_time]`` pairs and a ``"metric_name"``.
        mins: width of the averaging window, in minutes.

    Returns:
        A dict keyed by metric name whose values are
        ``{"count": ..., "sum": ..., "average": ...}``. ``average`` is 0 when
        no usable datapoint falls inside the window.
    """
    results = {}
    for dataset in data:
        datapoints  = dataset["datapoints"]
        # When using aggregate graphs ganglia mangles the metric name; the
        # real name is the second space-separated word.
        metric_name = dataset["metric_name"].split(' ')[1]

        latest_timestamp   = last_valid_timestamp(datapoints)
        earliest_timestamp = latest_timestamp - (60 * mins)

        total = 0
        count = 0
        for value, _timestamp in filter_valid_datapoints(
                datapoints, earliest_timestamp, latest_timestamp):
            try:
                total += value
            except TypeError:
                # Skip samples that are not numbers (presumably gaps in the
                # series) without hiding unrelated errors.
                continue
            count += 1

        average = 0
        if count != 0:
            # float() keeps Python 2 from truncating when every sample is an int.
            average = float(total) / count

        results[metric_name] = {"count": count, "sum": total, "average": average}

    return results

def last_valid_timestamp(data):
    """
    Takes a list of data in the following format:
    [[value, unix_timestamp], ...]
    Returns the largest timestamp among the pairs whose value is not 0,
    or 0 when there is no such pair.
    """
    newest = 0
    for value, stamp in data:
        if value == 0:
            # Zero-valued samples never move the result.
            continue
        newest = max(stamp, newest)

    return newest

def check_results(mbps, max_speed, crit, warn):
    """
    Returns the alert level (CRIT, WARN, UNKN or OK) for a measured rate.

    ``crit`` and ``warn`` are percentages of ``max_speed``. A negative rate
    is UNKN. A rate above ten times ``max_speed`` is deliberately OK (see
    the comment below); otherwise a rate above the crit threshold is CRIT,
    above the warn threshold is WARN, and anything else is OK.
    """
    crit_level = (crit / 100.0) * max_speed
    warn_level = (warn / 100.0) * max_speed
    unkn_level = max_speed * 10

    if mbps < 0:
        return UNKN

    # When networking is restarted ganglia reports the nic as having done
    # trillions of bytes in one second, which causes false alarms. So the
    # first rung answers OK, not CRIT, for anything over 10x what the box
    # should be capable of serving. Rungs are tried in order; first match wins.
    ladder = (
        (unkn_level, OK),
        (crit_level, CRIT),
        (warn_level, WARN),
    )
    for threshold, status in ladder:
        if mbps > threshold:
            return status

    return OK

def main():
    """Run the bandwidth check and exit with the resulting plugin status.

    Parses the command line, fetches the host's ``bytes_in``/``bytes_out``
    data from Ganglia, works out the available bandwidth (auto-detected from
    the local NICs unless overridden with ``--nics``/``--total``), prints a
    one-line summary and terminates through ``NagiosStatus.exit()``.
    Conditions that make the check impossible exit with UNKN.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--warn', dest='warn', type=int, default=90, help='warn level')
    parser.add_argument('-c', '--crit', dest='crit', type=int, default=95, help='crit level')
    parser.add_argument('-m', '--mins', dest='mins', type=int, default=5,  help='mins to average')
    parser.add_argument('-n', '--nics', dest='nics', type=int, default=None,  help='manually set the number of nics')
    parser.add_argument('-t', '--total', dest='bandwidth', type=int, default=None,  help='manually set the total bandwidth')
    parser.add_argument('-H', '--host', dest='host',  default=None, help='check non-local host')
    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='Verbose Output')
    args = parser.parse_args()

    if args.host:
        host = args.host
    else:
        host = getfqdn()

    if args.verbose:
        print("Host: %s" % host)

    data = get_ganglia_data(host, verbose=args.verbose)

    if not data:
        print("Error: Can't get data for host '%s'." % host)
        UNKN.exit()

    try:
        (nics, max_speed) = get_nic_info(args.verbose)
    except OSError:
        # Detection failed: start from "nothing found" so both names are
        # always bound, and let the manual overrides below fill them in.
        nics, max_speed = 0, 0

    if args.nics:
        nics = args.nics
    if args.bandwidth:
        max_speed = args.bandwidth

    if max_speed <= 0:
        # Without a usable bandwidth the thresholds are meaningless and the
        # percentage calculation below would divide by zero.
        print("Can't automatically detect bandwidth or number of nics. Please manually set.")
        UNKN.exit()

    results = process_data(data, args.mins)
    try:
        out_bytes = results["bytes_out"]["average"]
        in_bytes  = results["bytes_in"]["average"]
    except KeyError as e:
        print("UNKN: Ganglia returned no %s metric for host '%s'." % (e, host))
        UNKN.exit()

    # bytes -> megabits; float divisors avoid Python 2 integer truncation.
    out_mbps = (out_bytes / 1000.0 / 1000.0) * 8
    in_mbps  = (in_bytes / 1000.0 / 1000.0) * 8
    out_alert = check_results(out_mbps, max_speed, args.crit, args.warn)
    in_alert = check_results(in_mbps, max_speed, args.crit, args.warn)

    # Report the more severe of the two directions.
    alert = out_alert.max(in_alert)

    print("%(status)s: %(in_mbps)0.2f (%(in_percent).2f%%) in / %(out_mbps)0.2f (%(out_percent).2f%%) out - (%(nics)d interfaces with %(max_speed)s mbps) (w: %(warn).0f%% / c: %(crit).0f%%)" % {
            "out_mbps"    : out_mbps,
            "in_mbps"     : in_mbps,
            "in_percent"  : (in_mbps  / max_speed) * 100,
            "out_percent" : (out_mbps / max_speed) * 100,
            "nics"        : nics,
            "status"      : alert,
            "max_speed"   : str(max_speed),
            "crit"        : args.crit,
            "warn"        : args.warn,
            })

    alert.exit()

# Run the check only when this file is executed as a script, not on import.
# main() ends the process itself through NagiosStatus.exit(); the sys.exit()
# wrapper only takes effect if main() ever returns.
if __name__ == "__main__":
    sys.exit(main())
