#!/usr/bin/env python
# Twitch NRDP - Client-Side Daemon
# David Newhall II - Oct 17, 2016

import argparse
import cgi
import logging
import shlex
import signal
import socket
import string
import sys
import time
from logging.handlers import SysLogHandler
from os import listdir, remove
from os.path import isfile, isdir, join
from subprocess import Popen, PIPE
from xml.dom.minidom import parseString
from xml.parsers.expat import ExpatError
from xml.sax.saxutils import escape as xml_escape

import requests
import timeout
import yaml


# Defaults for our server
# Fallback NRDP token: used as the default for -t/--token, and replaced by the
# contents of the -f/--tokenfile file when that file exists.
NRDP_TOKEN = "You may put a token here, but why?"

# XML templates used to POST to NRDP API.
# One <checkresult> element per reported service. The four positional
# placeholders are filled, in order, with: hostname, service name, state
# (exit code) and output.
_SVC_TEMPLATE = """
  <checkresult type='service' checktype='1'>
    <hostname>{}</hostname>
    <servicename>{}</servicename>
    <state>{}</state>
    <output>{}</output>
  </checkresult>
"""

# Envelope for the whole payload. The single placeholder receives the
# concatenated <checkresult> elements; build_xml() strips the surrounding
# whitespace so the document starts with the XML declaration.
_NRDP_TEMPLATE = """
<?xml version="1.0" encoding="utf-8"?>
<checkresults>
{}
</checkresults>
"""


class SendNrdp(object):
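    # Last known state of every service, keyed by service name. Each value is a
    # dict with "status", "output", "attempt" and "time". It is defined on the
    # class, so the same dict is shared by all instances.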
    svc_states = {}

    def build_parser(self):
        """Define the command-line interface and parse ``sys.argv``.

        Returns:
            argparse.Namespace: the parsed options. ``url`` is a list (the flag
            may be repeated); ``interval``, ``retry`` and ``timeout`` are ints.

        argparse exits the process itself (SystemExit) on invalid arguments or
        when the required -u/--url is missing.
        """
        parser = argparse.ArgumentParser(
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            description = """Originally, this daemon was designed to
suck in data about passive checks that have already been
performed. The initial design did not provide enough
variation in check times; the checks fired from cron.d
entries. The function of sucking passive check data in
still exists. To use it, put files in the <path>|--path
with the format: service-name<tab>exit_code<tab>output
Output can have newlines, they are represented in
Nagios correctly. One service per file.
Now the design of this daemon is to execute passive
checks directly and report them to Nagios/NRDP at
an <interval>|--interval; normally 5 minutes.
To execute a passive check, put a yaml file in the
<checksd>|--services folder. The name of the file
is the name of the service in Nagios. It must contain
entries for 'command' and 'interval' at a minimum.
'retry' and 'retry_interval' are optional in the yaml.
If omitted, retry will be set to the default (below).
If omitted, retry_interval will be set to interval.
The idea is to be similar to NRPE + Nagios active checks.""")

        parser.add_argument('-p', '--path', dest="path", default="./checkcache",
                            help="Slurp passive Nagios checks from this folder.")
        parser.add_argument('-d', '--delim', default="\t", dest="delim",
                            help="Delimiter used to parse files. Default is <tab>.")
        parser.add_argument('-t', '--token', dest="token", default=NRDP_TOKEN,
                            help="The token used to access remote NRDP URL(s).")
        parser.add_argument('-f', '--tokenfile', dest="tokenfile", default="token.txt",
                            help="Read token from file. Creation will override -t")
        parser.add_argument('-u', '--url', dest="url", required=True, action="append",
                            help="NRDP URL to send updates to. Use more than once.")
        # type=int is required here: the value is used in time arithmetic, and
        # without it any interval given on the command line stays a string.
        parser.add_argument('-i', '--interval', type=int, dest="interval", default=300,
                            help="The interval at which we send updates to NRDP.")
        parser.add_argument('-r', '--retry', type=int, dest="retry", default=2,
                            help="Default retries before reporting a Problem.")
        parser.add_argument('-H', '--host', dest="hostname", default=socket.getfqdn(),
                            help="FQDN of the host associated with the passive check.")
        parser.add_argument('-A', '--auto', dest="autohost", default=False, action="store_true",
                            help="Automatically determine the host's FQDN.")
        parser.add_argument('-m', '--monitor', default=False, action="store_true", dest="monitor",
                            help="Monitor Checkcache Path and services (checks.d); daemon mode.")
        parser.add_argument('-s', '--services', default="./checks.d", dest="checksd",
                            help="This is the checks.d/ folder with service checks.")
        parser.add_argument('-T', '--timeout', type=int, default=15,
                            dest="timeout", help="HTTP (NRDP API) timeout in seconds.")
        parser.add_argument('-l', '--log', default=False, action="store_true", dest="syslog",
                            help="Use this flag if you want to log to syslog.")
        parser.add_argument('-D', '--debug', action="store_true", default=False,
                            dest="debug", help="Turn on debugging messages.")
        return parser.parse_args()

    # This function verifies the op-args that need verifying and starts
    # self.one_infinite_loop.
    def run(self):
        """Parse options, set up logging, validate the inputs and start the loop.

        Logging always goes to stdout; a syslog handler (LOCAL3 via /dev/log)
        is added first when -l/--log is given. Without -m/--monitor the retry
        count is forced to 1. A token file, when present, overrides -t.

        Raises:
            Exception: if the --path directory does not exist.
        """
        op = self.build_parser()

        log = logging.getLogger('nrdp')
        log.setLevel(logging.INFO)
        formatter = logging.Formatter('%(name)s [%(process)s]: %(levelname)s - %(message)s')

        # Optional syslog handler; attached before the console handler.
        if op.syslog:
            syslog_handler = SysLogHandler('/dev/log', facility=SysLogHandler.LOG_LOCAL3)
            syslog_handler.setFormatter(formatter)
            log.addHandler(syslog_handler)

        # Console handler is always present.
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(formatter)
        log.addHandler(console_handler)

        if not isdir(op.path):
            raise Exception("Unable to locate the provided path: {}"
                            .format(op.path))

        if op.debug:
            log.setLevel(logging.DEBUG)

        if not op.monitor:
            checking = "Running once. Checking '{}' for updates ".format(op.path)
            services = "and '{}' for services.".format(op.checksd)
            log.info(checking + services)
            # A single pass has no second attempt, so report problems at once.
            op.retry = 1
        else:
            log.info("Agent daemonizing. Monitoring '{}' for updates.".format(op.path))
            log.info("Performing any checks found in path '{}'".format(op.checksd))

        # A token file (token.txt, or whatever -f names) takes precedence
        # over the -t value when it exists.
        if isfile(op.tokenfile):
            with open(op.tokenfile, 'r') as token_file:
                op.token = token_file.read().strip()

        self.one_infinite_loop(op, log)

    def one_infinite_loop(self, op, log):
        """Run checks, slurp dropped results and post to NRDP, once or forever.

        Each pass runs the due service checks, periodically re-reads the
        checks.d folder, reads result files from ``op.path`` and posts to NRDP
        when something changed or ``op.interval`` seconds have passed since
        the last successful post. Without ``op.monitor`` only one pass is
        made; otherwise the loop sleeps 4 seconds between passes.

        Args:
            op: parsed options namespace from build_parser().
            log: logger used for all messages.
        """
        started = time.time()
        svc_checks = self.load_checks(log, op.checksd, {}, op.retry, op.monitor)
        reload_at = started + 5
        # Start one full interval in the past so the first pass always reports.
        last_update = started - op.interval

        while True:
            # Runs the due checks; their status/output land in self.svc_states.
            checks_changed = self.run_checks(log, svc_checks, op.monitor)

            now = time.time()
            if reload_at < now:
                svc_checks = self.load_checks(log, op.checksd, svc_checks,
                                              op.retry, op.monitor)
                # Re-read every ~5 seconds for the first 10 minutes (checks
                # are added one per reload), then once a minute.
                reload_at = now + (60 if now - started > 600 else 5)

            # Pull externally produced results into self.svc_states.
            drops_changed = self.load_updates(log, op.path, op.delim, op.retry)

            force_update = False
            update_nagios = bool(checks_changed or drops_changed)
            if not update_nagios and last_update + op.interval <= now:
                # Nothing changed, but a full interval passed: report anyway.
                update_nagios = force_update = True
                log.debug("Update Triggered: Last update {}s > interval({}s)."
                          .format(int(now-last_update), op.interval))

            if update_nagios:
                if op.autohost:
                    op.hostname = socket.getfqdn()
                xml = self.build_xml(log, op.hostname, op.retry, last_update,
                                     svc_checks, force_update)
                # last_update only moves forward after a successful post.
                if xml and self.post_data(log, op.timeout, op.url, op.token, xml):
                    last_update = now

            if not op.monitor:
                break
            time.sleep(4)

    # This function scans our checks.d folder and returns all service checks.
    def load_checks(self, log, checksd, old_svc_checks, retry, monitor):
        """Scan the checks.d folder and return the service checks found there.

        Every regular file in ``checksd`` is parsed as YAML; the file name
        (XML-escaped) is the service name. Files that do not hold a mapping
        with both 'command' and 'interval' are skipped with a warning.

        Args:
            log: logger used for messages.
            checksd: path of the checks.d folder; an empty or missing folder
                yields no checks.
            old_svc_checks: checks returned by the previous call; used to keep
                each known check's 'next_check' time.
            retry: default 'retry' for checks that do not set one.
            monitor: True in daemon mode. In that mode the function returns as
                soon as it has added ONE new check, so new checks are splayed
                out over successive calls instead of all firing together.

        Returns:
            dict: service name -> check settings, with 'next_check',
            'timeout', 'retry' and 'retry_interval' always present.
        """
        # Default to "no files" so a missing/empty checksd cannot leave the
        # name unbound below.
        files = []
        if checksd and isdir(checksd):
            files = [name for name in listdir(checksd) if isfile(join(checksd, name))]

        svc_checks = {}
        for filename in files:
            filepath = join(checksd, filename)
            svc_name = xml_escape(filename, {'"': '&quot;'})
            # Read the file into a dict with pyyaml.
            # NOTE(review): yaml.load() without an explicit Loader can build
            # arbitrary Python objects; these files also carry the command to
            # execute, so they are already trusted. Consider yaml.safe_load().
            with open(filepath, 'r') as f:
                svc = yaml.load(f)
            # An empty file parses to None; anything but a mapping is unusable.
            if not isinstance(svc, dict) or "interval" not in svc or "command" not in svc:
                log.warning("Ignored service file missing command/interval: {}"
                            .format(filepath))
                continue

            # Decide "new" per file, and only for valid files, so an invalid
            # file cannot make the next valid one look new.
            new_service = svc_name not in old_svc_checks
            if new_service:
                # New check: it is due right away.
                svc["next_check"] = int(time.time())
            else:
                # Known check: keep its schedule, pick up any changed settings.
                svc["next_check"] = old_svc_checks[svc_name]["next_check"]

            # Optional settings; make sure they exist after load.
            svc.setdefault("timeout", 10)
            svc.setdefault("retry", retry)
            # Documented default: retry_interval falls back to interval.
            svc.setdefault("retry_interval", svc["interval"])
            svc_checks[svc_name] = svc

            if monitor:
                if new_service:
                    log.info("New Service Check, running every {} minutes: {}"
                             .format(svc["interval"], svc_name))
                    # Return here, add 1 check at a time. This splays them out.
                    return svc_checks
                log.debug("Updated Service Check, runs every {} minutes: {}"
                          .format(svc["interval"], svc_name))

        # A check whose file disappeared loses its saved state too.
        for svc_name in old_svc_checks:
            if svc_name not in svc_checks and svc_name in self.svc_states:
                del self.svc_states[svc_name]
        return svc_checks

    # This function runs passive checks and stores their return code/output.
    # It also schedules the next check of a service. Returns True if a service
    # had a state change; meaning Nagios needs to be updated.
    def run_checks(self, log, svc_checks, monitor):
        """Run every service check that is due and record its result.

        For each due check the command is executed through self.command(),
        its output is XML-escaped and stored in ``self.svc_states`` together
        with the exit status and an attempt counter, and the check's
        'next_check' time is moved forward.

        Args:
            log: logger used for messages.
            svc_checks: dict of checks as returned by load_checks(); the
                'next_check' values are updated in place.
            monitor: when False (single run) the output of every check is
                logged at INFO level.

        Returns:
            bool: True if any check is new, changed status, or reached its
            retry count while failing, i.e. NRDP should be updated.
        """
        # Characters kept in the output. string.printable still contains
        # vertical tab and form feed, which XML 1.0 does not allow.
        xml_safe = set(string.printable) - set("\x0b\x0c")
        send_nagios_update = False
        for svc_name, svc_data in svc_checks.items():
            if svc_data["next_check"] > int(time.time()):
                # This one isn't ready yet, skip it!
                continue

            cmd = svc_data["command"]
            retry = svc_data["retry"]
            # interval and retry_interval are configured in minutes.
            interval = svc_data["interval"] * 60
            retry_interval = svc_data["retry_interval"] * 60

            log.info("Triggering Service Check: {} - {}".format(svc_name, cmd))
            svc_status, output = self.command(shlex.split(cmd), svc_data["timeout"])
            # Escape for XML and convert new lines to NRDP format.
            svc_output = xml_escape(output, {'"': '&quot;'}).replace('\n', '\\n')
            # Drop characters that would make the XML payload invalid. Joining
            # keeps the result a string on both Python 2 and Python 3.
            svc_output = "".join(c for c in svc_output if c in xml_safe)
            if not monitor:
                log.info("Service Output for {} ({}): {}".format(svc_name, svc_status, svc_output))

            previous = self.svc_states.get(svc_name)
            if previous is None or previous["status"] != svc_status:
                # New service, single-run, state changed, or daemon just started.
                attempt = 1
                send_nagios_update = True
                log.debug("Update Triggered: New service or change for {}."
                          .format(svc_name))
            else:
                # Nothing changed, increase the attempt counter.
                attempt = previous["attempt"] + 1
                log.debug("No Update: Found Service in same state; {}."
                          .format(svc_name))

            if svc_status != 0 and attempt == retry:
                # Send an update after retry attempts, when status is not OK.
                send_nagios_update = True
                log.debug("Update Triggered: Retries expired, PROBLEM with {}"
                          .format(svc_name))

            # A failing check that has not used up its retries is re-run after
            # retry_interval; everything else waits the normal interval.
            if svc_status != 0 and attempt < retry:
                wait = retry_interval
            else:
                wait = interval
            svc_data["next_check"] = int(wait + time.time())
            # Save the current state of this service.
            self.svc_states[svc_name] = {"status": svc_status, "output": svc_output,
                                         "attempt": attempt, "time": time.time()}
        return send_nagios_update

    # This function saves service-check updates to the `svc_states` dict.
    # Returns true if a state changed so new state can be updated in Nagios.
    def load_updates(self, log, path, delim, retry):
        """Read externally produced check results from ``path`` into svc_states.

        Each regular file in ``path`` holds one result in the form
        ``service-name<delim>exit_code<delim>output``. The output may span
        several lines and may itself contain the delimiter. A file that is
        processed is removed; a malformed file is logged and left in place.

        Args:
            log: logger used for messages.
            path: folder to read result files from.
            delim: field delimiter.
            retry: number of consecutive non-zero results before a problem
                triggers an update.

        Returns:
            bool: True if any service is new, changed status, or reached the
            retry count while failing, i.e. NRDP should be updated.
        """
        # Characters kept in the output. string.printable still contains
        # vertical tab and form feed, which XML 1.0 does not allow.
        xml_safe = set(string.printable) - set("\x0b\x0c")
        files = [f for f in listdir(path) if isfile(join(path, f))]
        send_nagios_update = False
        for filename in files:
            filepath = join(path, filename)
            # Read the file into a variable. One service check per file.
            # Newlines in output are allowed.
            with open(filepath, 'r') as f:
                checkdata = f.read()
            # Split at most twice so the output may contain the delimiter.
            parts = checkdata.split(delim, 2)
            svc_status = None
            if len(parts) == 3 and parts[0]:
                try:
                    svc_status = int(parts[1])
                except ValueError:
                    # Non-numeric exit code: handled as malformed below.
                    pass
            if svc_status is None:
                log.error("Ignored malformed data in file {}: {}"
                          .format(filepath, checkdata))
                continue

            svc_name = xml_escape(parts[0], {'"': '&quot;'})
            svc_output = xml_escape(parts[2].rstrip(), {'"': '&quot;'}).replace('\n', '\\n')
            # Drop characters that would make the XML payload invalid.
            svc_output = "".join(c for c in svc_output if c in xml_safe)

            previous = self.svc_states.get(svc_name)
            if previous is None or previous["status"] != svc_status:
                # New service, single-run, state changed, or daemon just started.
                attempt = 1
                send_nagios_update = True
                log.debug("Update Triggered: New service or change for {}."
                          .format(svc_name))
            else:
                # Nothing changed, increase the attempt counter.
                attempt = previous["attempt"] + 1
                log.debug("No Update: Found Service in same state; {}."
                          .format(svc_name))

            if svc_status != 0 and attempt == retry:
                # Send an update after retry attempts, when status is not OK.
                send_nagios_update = True
                log.debug("Update Triggered: Retries expired, PROBLEM with {}"
                          .format(svc_name))

            remove(filepath)
            # Save the current state of this service.
            self.svc_states[svc_name] = {"status": svc_status, "output": svc_output,
                                         "attempt": attempt, "time": time.time()}
        return send_nagios_update

    # This function builds and returns the XML payload.
    def build_xml(self, log, hostname, retry, last_update, svc_checks, force=False):
        """Assemble the NRDP XML document from the saved service states.

        A service is included when its state was recorded after
        ``last_update``, unless it is failing and has not yet reached its
        retry count (in which case the problem is only logged). Whenever
        something is reported, or ``force`` is set, an always-OK "nrdp"
        service is appended and the result is wrapped in the NRDP envelope.

        Args:
            log: logger used for messages.
            hostname: host name written into every check result.
            retry: retry count for services that have no entry in svc_checks.
            last_update: time of the last successful post.
            svc_checks: checks from load_checks(); supplies per-service 'retry'.
            force: build a document even when no service needs reporting.

        Returns:
            str: the XML document, or "" when there is nothing to send.
        """
        fragments = []
        for svc_name, state in self.svc_states.items():
            status, output, attempts, checked_at = (
                state[key] for key in ("status", "output", "attempt", "time"))
            needed = svc_checks[svc_name]["retry"] if svc_name in svc_checks else retry

            if status != 0 and attempts < needed:
                # Squelch the problem-state change until we get to retry #.
                log.info("PROBLEM({}) w/ service {}, waiting for {}/{} updates."
                         .format(status, svc_name, attempts, needed))
                continue
            if checked_at > last_update:
                # Only results newer than the last successful post are sent.
                fragments.append(
                    _SVC_TEMPLATE.format(hostname, svc_name, status, output).rstrip())

        xml = "".join(fragments)
        if xml == "" and not force:
            return xml
        # Add NRDP to our reported services.
        xml += _SVC_TEMPLATE.format(hostname, "nrdp", "0", "OK!").rstrip()
        return _NRDP_TEMPLATE.format(xml.strip("\n")).strip()

    # This function POSTs the XML to Nagios. Returns true if NRDP replies OK.
    def post_data(self, log, timeout, urls, token, xml):
        """POST the XML payload to every NRDP URL.

        Args:
            log: logger used for messages.
            timeout: HTTP timeout in seconds, passed to requests.
            urls: list of NRDP URLs; each one receives the same payload.
            token: NRDP token (surrounding whitespace is stripped).
            xml: the XML document built by build_xml().

        Returns:
            bool: True if at least one server replied with status 0. A URL
            that cannot be reached or that returns a bad reply is logged and
            skipped. A payload that is not well-formed XML is not sent at all
            and yields False.
        """
        # The payload is parsed only to list the service names in the log,
        # but a payload that does not parse is not worth sending either.
        try:
            name_nodes = parseString(xml).getElementsByTagName("servicename")
        except ExpatError as e:
            log.error("Not sending malformed XML payload ({}):\n{}".format(e, xml))
            return False
        # An empty <servicename/> has no child nodes; report it as "".
        svcs = ["".join(child.data for child in node.childNodes
                        if child.nodeType == child.TEXT_NODE)
                for node in name_nodes]
        svc_count = len(svcs)
        svc_report = ", ".join(svcs)

        params = {'token': token.strip(), 'cmd': 'submitcheck', 'XMLDATA': xml}
        return_status = False
        # Lots of error checking on http stuff.
        for url in urls:
            try:
                if "staging" in url:
                    # NOTE(review): certificate verification is switched off
                    # for any URL containing "staging" -- confirm this is
                    # still wanted.
                    reply = requests.post(url, data=params, timeout=timeout, verify=False)
                else:
                    reply = requests.post(url, data=params, timeout=timeout)
            except Exception as e:
                # The message still says "Exiting", but this deliberately does
                # NOT exit: exiting here broke production, so the URL is
                # skipped instead. (To be fixed with a puppet change.)
                log.error("Exiting: Cannot connect to url: {}\n{}".format(url, e))
                continue

            # Reset per URL so the error handler never reports an unbound
            # name or the reply of a previous URL.
            data = ""
            try:
                data = reply.text
                result = parseString(data)
                if int(result.getElementsByTagName("status")[0].childNodes[0].data) == 0:
                    return_status = True
                    log.info("Successfully sent {} updates to {}: {}"
                             .format(svc_count, url, svc_report))
                    log.debug("XML Payload Sent:\n{}".format(xml))
                    log.debug("Server Reply Received:\n{}".format(data.rstrip()))
                else:
                    log.warning("ERROR - NRDP Returned: {}".format(result
                                .getElementsByTagName("message")[0].childNodes[0].data))
                    log.warning("XML Payload Sent:\n{}".format(xml))
            except Exception as e:
                log.warning("Bad data from NRDP server, {}".format(url))
                log.warning("XML Payload Sent:\n{}".format(xml))
                log.warning("Server Reply Received:\n{}\n{}".format(data, e))
        return return_status

    # This ridiculous hack is because Python 2 doesn't have a timeout in subprocess.
    def command(self, cmd, seconds):
        """Run ``cmd`` with a time limit and return its exit status and output.

        Args:
            cmd: the command as an argument list (no shell is involved).
            seconds: time limit handed to timeout.Timeout.

        Returns:
            tuple: (exit status, output). The output is stdout and stderr
            joined by a space with trailing whitespace removed. If anything
            raises -- the command cannot be started, or the time limit fires
            (presumably timeout.Timeout raises then; verify against that
            module) -- the status is 2 and the output is
            "Caught Exception: <message>".
        """
        process = None
        try:
            err_msg = "Timeout after {} seconds: {}".format(seconds, " ".join(cmd))
            with timeout.Timeout(seconds=seconds, err_msg=err_msg):
                # Kept on self as before, in case anything inspects it.
                process = self.process = Popen(cmd, stdout=PIPE, stderr=PIPE)
                streams = process.communicate()
                # Python 3 hands back bytes; Python 2 str passes through as is.
                texts = [s if isinstance(s, str) else s.decode("utf-8", "replace")
                         for s in streams]
                output = " ".join(texts).rstrip()
                svc_status = process.returncode
        except Exception as e:
            output = "Caught Exception: {}".format(e)
            svc_status = 2
            # Don't leave the child running after a timeout or other failure.
            if process is not None and process.poll() is None:
                try:
                    process.kill()
                    process.wait()
                except OSError:
                    # The child exited between poll() and kill(); nothing to do.
                    pass
        return svc_status, output


def exit_gracefully(signum, frame):
    """Signal handler: say so on stderr, then exit with status 0.

    Both arguments are supplied by the signal module and are not used.
    """
    sys.stderr.write('Caught signal, exiting.\n')
    raise SystemExit(0)

if __name__ == "__main__":
    # Exit cleanly on hangup, interrupt (Ctrl-C) and terminate signals.
    for signum in (signal.SIGHUP, signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, exit_gracefully)
    SendNrdp().run()
