#!/usr/bin/python

# Author: Cyrus Hall <cyrus@justin.tv>

import optparse, sys, time
from datetime import datetime, timedelta
import urllib2
import traceback

try:
    import simplejson as json
except ImportError:
    import json

# Exit statuses, each a (process exit code, label) pair.  The code is the
# label's position in the tuple below: OK=0, WARN=1, CRITICAL=2, UNKNOWN=3.
OK, WARN, CRITICAL, UNKNOWN = tuple(
    enumerate(('OK', 'WARN', 'CRITICAL', 'UNKNOWN')))

def exit_with(err_lvl, mesg):
    """Print '<label>: <mesg>' and terminate with the level's exit code.

    err_lvl is a (exit code, label) pair; mesg is the text shown after the
    label.  Never returns: sys.exit() raises SystemExit.
    """
    label = err_lvl[1]
    code = err_lvl[0]
    # A single parenthesised argument prints the same text under the
    # Python 2 print statement as the unparenthesised form.
    print('%s: %s' % (label, mesg))
    sys.exit(code)

def node_names(node_list):
    """Return the 'name' entry of every node, joined with ', '."""
    names = []
    for node in node_list:
        names.append(node['name'])
    return ', '.join(names)

opt_args = [
  (['--verbose', '-v'], {'help': 'be verbose', 'action': 'store_true', 'default': False}),
  (['--cluster', '-p'], {'help': 'the cluster to check'}),
  (['--usherhost', '-u'], {'help': 'usher host to use', 'default': 'usher.justin.tv'}),
  (['--critical', '-c'], {'help': 'node timeout length to go critical', 'type': 'int', 'default': 60}),
  (['--warn', '-w'], {'help': 'node timeout length to warn at', 'type': 'int', 'default': 45}),
  (['--expected', '-e'], {'help': 'expected period of node updates', 'type': 'int', 'default': 30}),
]

parser = optparse.OptionParser()
for option in opt_args: parser.add_option(*option[0], **option[1])
options, args = parser.parse_args()

if not options.cluster:
    print 'Must provide a cluster to check.'
    sys.exit(3)

if options.verbose:
    print 'Cluster: %s\nExpected update period: %s\nWarning timeout: %s\nCritical timeout: %s' \
                % (options.cluster, options.expected, options.warn, options.critical)

url = 'http://%s/cluster/nodes/%s.json' % (options.usherhost, options.cluster)
if options.verbose:
    print 'Requesting %s' % url

try:
    f = urllib2.urlopen(url)
except urllib2.URLError, e:
    exit_with(CRITICAL, 'Usher is down!')

try:
    nodes = json.loads(f.readline())

    if len(nodes) == 0:
        exit_with(CRITICAL, 'Request to retrieve nodes for %s failed.' % options.cluster)

    # get current time, init time delta
    cur_time = datetime.now()

    for n in nodes:
        # date format: Fri Mar 16 22:12:19 2012
        date = datetime.strptime(n['updated_on'], '%a %b %d %H:%M:%S %Y')
        n['updated_on'] = date

    def timestamp_aged(node, secs):
        return node['updated_on'] < cur_time - timedelta(seconds=secs) 

    critical_nodes = []
    warning_nodes = []
    over_expected_nodes = []

    for n in nodes:
        if timestamp_aged(n, options.critical):
            critical_nodes.append(n)
        elif timestamp_aged(n, options.warn):
            warning_nodes.append(n)
        elif timestamp_aged(n, options.expected):
            over_expected_nodes.append(n)

    num_crit = len(critical_nodes)
    num_warn = len(warning_nodes)
    num_over_exp = len(over_expected_nodes)

    if options.verbose:
        print 'Total nodes: %s, over expected ts: %s, warning: %s, critical %s' % (len(nodes), num_over_exp, num_warn, num_crit)

    if num_crit > 0:
        mesg = '%s nodes in %s have very old usherdb timestamps: %s' % (num_crit, options.cluster, node_names(critical_nodes))
        exit_with(CRITICAL, mesg)
    elif num_warn > 0:
        mesg = '%s nodes in %s have old usherdb timestamps: %s' % (num_warn, options.cluster, node_names(warning_nodes))
        exit_with(WARN, mesg)
    else:
        mesg = 'All nodes in %s have good timestamps' % options.cluster
        if num_over_exp > 0:
            mesg += ' (%s over expected value)' % num_over_exp
        exit_with(OK, mesg)

except SystemExit, e:
    sys.exit(e.code)
except:
    exit_with(UNKNOWN, traceback.print_exc())
