#!/usr/bin/env python
import subprocess
import traceback
import argparse
import requests
import json
import sys
import re

def parse_args():
    ''' Builds the command-line interface of the check and parses sys.argv

    Returns the argparse namespace holding max_old_uptime, tenfoot_expvar,
    deployments, services, service_name and svc_dir.
    '''
    cli = argparse.ArgumentParser()

    # the only numeric option; everything below is a plain string
    cli.add_argument(
        '--max-old-uptime',
        metavar='N',
        type=int,
        default=60,
        help='Emit CRITICAL if old process stays up for longer than N seconds after the last restart',
    )

    # (flag, default value, help text) -- registered in this order
    string_options = (
        ('--tenfoot-expvar',
         'http://localhost:6565/debug/vars',
         'Address of tenfoot endpoint exposing its runtime variables'),
        ('--deployments',
         'http://skadi.internal.justin.tv/v1/artifacts/deployed?owner=video&name=tenfoot',
         'Address of an endpoint exposing expected deployed versions of tenfoot'),
        ('--services',
         'http://localhost:8500/v1/agent/services',
         'Address of an endpoint exposing info about locally running services'),
        ('--service-name',
         'tenfoot',
         'Consul name of the checked service'),
        ('--svc-dir',
         '/etc/service/tenfoot',
         'Service directory'),
    )
    for flag, fallback, description in string_options:
        cli.add_argument(flag, type=str, default=fallback, help=description)

    return cli.parse_args()

# Status labels accepted by exit_with(); each one is printed verbatim in the
# check output and mapped to a process exit code inside exit_with().
OK = 'OK'
WARNING = 'WARNING'
CRITICAL = 'CRITICAL'
UNKNOWN = 'UNKNOWN'

def exit_with(status, message, tb=False):
    ''' Prints a result and exits in the way Nagios expects

    status  -- one of OK, WARNING, CRITICAL or UNKNOWN; any other value is
               printed as given but exits with the UNKNOWN code
    message -- free-form text appended to the status line
    tb      -- when true, the traceback of the exception currently being
               handled is printed after the status line

    Never returns: always leaves through sys.exit() (SystemExit).
    '''
    # print the supplied message
    plugin_name = sys.argv[0]
    print('{0}: {1} - {2}'.format(plugin_name, status, message))
    if tb:
        # format_exc() returns the traceback as text, so it ends up on stdout
        # together with the status line. print_exc() writes to stderr and
        # returns None, which would be printed as a literal "None".
        print(traceback.format_exc().rstrip())

    # exit with the proper exit code
    states = {
        OK: 0,
        WARNING: 1,
        CRITICAL: 2,
        UNKNOWN: 3,
    }
    # unrecognised statuses fall back to the UNKNOWN exit code
    sys.exit(states.get(status, states[UNKNOWN]))

class CheckTenfoot(object):
    ''' Check of a locally running tenfoot instance

    Fetches three JSON documents (expected deployments, locally registered
    services, tenfoot runtime variables) and verifies the process tree and
    the running version against them. Every failed verification ends the
    process through exit_with(); a method that returns means its check passed.
    '''

    def __init__(self, options):
        # parsed command-line options (see parse_args)
        self.options = options

    def fetch_json(self, url, name):
        ''' GETs url and returns the decoded JSON body

        name is only used to label the endpoint in failure messages.
        Exits with CRITICAL when the request fails, the status code is not
        200, or the body is not valid JSON.
        '''
        try:
            resp = requests.get(url, timeout=5)
        except Exception:
            exit_with(CRITICAL, 'could not contact {0} (url: {1})'.format(name, url), tb=True)

        if resp.status_code != 200:
            exit_with(CRITICAL, '{0} (url: {1}) responded with {2} status code'.format(name, url, resp.status_code))

        try:
            return json.loads(resp.text)
        except ValueError:
            exit_with(CRITICAL, 'incorrect response from {0} (url: {1}) (not a json)'.format(name, url), tb=True)

    def fetch(self):
        ''' Downloads all documents the checks below rely on
        '''
        self.deployments = self.fetch_json(self.options.deployments, 'deployments')
        self.services = self.fetch_json(self.options.services, 'services')
        self.tenfoot_expvar = self.fetch_json(self.options.tenfoot_expvar, 'tenfoot_expvar')

    # NOTE: this can be removed once we have a generic version check for all
    # the services of which versions are tracked in consul
    def check_version(self):
        ''' Compares the running tenfoot version with the expected one

        The expected version is the sha of the most recently updated
        deployment whose environment is listed in the tags of the locally
        registered service. Exits with UNKNOWN when the service is not
        registered or no deployment matches, and with CRITICAL on a version
        mismatch.
        '''
        svc = self.options.service_name
        if svc not in self.services:
            exit_with(UNKNOWN, 'tenfoot is not registered on this box (svc name used: {0})'.format(svc))

        # should this be filtered somehow?
        # a service registered without tags may carry None here; treat it as
        # "no tags" instead of failing on the membership test below
        tags = self.services[svc]['Tags'] or []
        version = self.tenfoot_expvar['tenfoot']['version']

        candidates = [entry['deployment'] for entry in self.deployments.values()
                      if entry['deployment']['environment'] in tags]
        if not candidates:
            exit_with(UNKNOWN, 'no deployments registered for the service')

        # ISO8601 timestamps compare lexicographically \o/
        # take the newest matching deployment: an ascending sort followed by
        # [0] would select the oldest one
        latest = max(candidates, key=lambda d: d['updated_at'])
        exp_version = latest['sha']
        exp_environment = latest['environment']

        if version != exp_version:
            exit_with(CRITICAL, 'version mismatch; want {0} (env: {1}), found {2}'.format(exp_version, exp_environment, version))

    def svcinfo(self, svcdir):
        ''' Returns [pid, uptime in seconds] reported by svstat for svcdir

        Exits with CRITICAL when the svstat output does not describe a
        running ("up") service.
        '''
        # universal_newlines makes check_output return text rather than bytes
        output = subprocess.check_output(['sudo', '/usr/bin/svstat', svcdir], universal_newlines=True)
        # the status follows the first colon of the output; if there is no
        # colon the whole output is kept so that it shows up in the error below
        svstat = output.split(':', 1)[-1].strip()
        m = re.match(r'up \(pid (\d+)\) (\d+) seconds', svstat)

        if not m:
            exit_with(CRITICAL, 'could not parse svstat (tenfoot process is down?): {0}'.format(svstat))

        # both groups matched \d+, so the conversion cannot fail
        return [int(group) for group in m.groups()]

    def check_proctree(self):
        ''' Verifies that the serving tenfoot process is the supervised one

        Exits with CRITICAL when the pid reported by svstat differs from the
        parent pid reported by tenfoot, or when other tenfoot processes are
        still around after the supervised one has been up for longer than
        --max-old-uptime seconds.
        '''
        pid = self.tenfoot_expvar['process']['pid']
        ppid = self.tenfoot_expvar['process']['ppid']
        svcpid, svcuptime = self.svcinfo(self.options.svc_dir)

        if svcpid != ppid:
            exit_with(CRITICAL, 'tenfoot instance listening on the socket (pids {0}, {1}) is not one managed by supervise (pid {2})'.
                    format(pid, ppid, svcpid))

        # every tenfoot pid other than the serving process and its parent
        running = subprocess.check_output(['pidof', 'tenfoot'], universal_newlines=True).split()
        strayprocs = [p for p in running if int(p) not in (pid, ppid)]
        # strays are tolerated until the supervised process has been up for
        # longer than the configured grace period
        if strayprocs and svcuptime > self.options.max_old_uptime:
            exit_with(CRITICAL, 'found an unmanaged tenfoot instance (pids: {0}); managed uptime: {1}'.format(strayprocs, svcuptime))

    def run(self):
        ''' Fetches the data and runs every check; returns only if all pass
        '''
        self.fetch()

        self.check_proctree()
        self.check_version()

if __name__ == '__main__':
    # A failed check leaves through exit_with(), i.e. SystemExit, which is not
    # an Exception subclass and therefore passes straight through this
    # handler. Only unexpected errors are reported as UNKNOWN here.
    try:
        options = parse_args()
        check = CheckTenfoot(options)
        check.run()
    except Exception:
        exit_with(UNKNOWN, 'unhandled check failure', tb=True)

    exit_with(OK, 'all checks passed')
