#!/usr/bin/env python

import array
import argparse
import collections
import fcntl
import os
import requests
import socket
import struct
import subprocess
import shlex
import sys
import time
import traceback
from urlparse import urlparse

# Fully-qualified name of this machine; used in alert emails and as the host
# field of passive check results.
HOSTNAME = socket.getfqdn()

# Status codes passed to send_nagios_passive_health().
NAGIOS_OK, NAGIOS_WARN, NAGIOS_CRIT = 0, 1, 2

# Throttling state for send_nagios_passive_health(): the time.time() value of
# the last report that was sent, and the status code it carried. Both stay
# None until the first report goes out.
last_ping = None
last_state = None

# Health-check target used when --url is not given.
DEFAULT_URL = 'http://localhost/liveness.xml'

# Command-line interface. Arguments are parsed at import time and the result
# is published as the module-level `args`, which the functions below read.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description=(
        "Keeps an interface up or down, depending on the result of a HTTP health check. "
        "Use this to keep unhealthy servers out of anycast. Most of these arguments "
        "resemble a Varnish backend polling configuration."
    ),
)

# What to probe, and how.
parser.add_argument(
    '-u', '--url',
    dest='url',
    default=DEFAULT_URL,
    help='Backend url to check for health. We issue a HEAD request, expect a 200 response.',
)
parser.add_argument(
    '-c', '--connect-only',
    dest='connect_only',
    default=False,
    action='store_true',
    help="Only connect to the destination, don't care about the response.",
)
parser.add_argument(
    '-r', '--raw-command',
    dest='raw_command',
    default=False,
    help="Run a unix command to perform the status check. Exiting zero is considered healthy, non-zero is unhealthy.",
)
parser.add_argument(
    '-s', '--source',
    dest='source',
    default=False,
    help='When using a connect-only test, the source IP address to send the request from.',
)

# Probe timing and the rolling success/failure window.
parser.add_argument(
    '-o', '--timeout',
    dest='timeout',
    type=float,
    default=5.0,
    help='Backend timeout limit in seconds.',
)
parser.add_argument(
    '-t', '--interval',
    dest='interval',
    type=float,
    default=1.0,
    help='Time to wait between probes.',
)
parser.add_argument(
    '-w', '--window',
    dest='window',
    type=int,
    default=2,
    help='History to keep of backend success/fail.',
)
parser.add_argument(
    '-e', '--threshold',
    dest='threshold',
    type=int,
    default=1,
    help='Number of successes required in history to consider backend healthy.',
)

# Run mode and the interface being managed.
parser.add_argument(
    '-n', '--noop',
    dest='noop',
    default=False,
    action='store_true',
    help="Just print, don't bring interface up/down.",
)
parser.add_argument(
    '-x', '--once',
    dest='once',
    default=False,
    action='store_true',
    help="Act once, then quit, rather than running continuously.",
)
parser.add_argument(
    '-i', '--interface',
    dest='interface',
    required=True,
    help="Interface to bring up/down. [REQUIRED]",
)

# Alerting: email and Nagios passive checks.
parser.add_argument(
    '-m', '--email',
    dest='email',
    default=None,
    help="Comma-separated email addresses for alert emails.",
)
parser.add_argument(
    '--nagios-server',
    dest='nagios_server',
    default=None,
    help="Nagios server to send passive checks",
)
parser.add_argument(
    '--nrdp-folder',
    dest='nrdp_folder',
    default=None,
    help="Folder to write NRDP check data too. Ex: /usr/local/nrdp/checkdata",
)
parser.add_argument(
    '--nagios-service',
    dest='nagios_service',
    default=None,
    help="Nagios service name to use in passive checks",
)

args = parser.parse_args()


def send_nagios_passive_health(mesg, status=NAGIOS_CRIT, interval=30):
    """Report health to Nagios as a passive check result.

    Does nothing unless --nagios-service is set together with at least one of
    --nrdp-folder or --nagios-server.

    When --nrdp-folder is set, the check data file is rewritten on every call.
    When --nagios-server is set, a send_nsca packet is sent, throttled to at
    most once every `interval` seconds unless the status has changed since the
    last report.

    Takes:
        mesg: string message to include in the nagios notification
        status: integer status code for the nagios health state
        interval: minimum number of seconds between reports of an unchanged
            status
    Returns 0 in all cases; the exit status of send_nsca is not inspected.
    """
    global last_ping
    global last_state

    if not (args.nagios_service and (args.nrdp_folder or args.nagios_server)):
        return 0

    # Always write NRDP files. NRDP handles state changes the same way this script does.
    if args.nrdp_folder:
        nrdp_file = args.nrdp_folder + '/' + args.nagios_service.replace(" ", "_")
        # Mode 'w' already truncates; the context manager guarantees the
        # handle is closed even if the write fails.
        with open(nrdp_file, 'w') as file_handle:
            file_handle.write("{}\t{}\t{}\n".format(args.nagios_service, status, mesg))

    if (last_state != status or
            last_ping is None or
            (last_ping + interval) <= time.time()):
        # only send a packet if we're changing state, or we haven't in last interval
        print(mesg)
        if args.nagios_server:
            cmd = ['send_nsca', '-H', args.nagios_server]
            p = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE)
            p.communicate(input="%s\t%s\t%s\t%s\n" % (HOSTNAME, args.nagios_service, status, mesg))
        last_ping = time.time()
        last_state = status
    return 0


def alert_email(mesg):
    """Send `mesg` to the --email recipients through the `mail` command.

    The message body is written to the mailer's stdin. The mailer then gets
    up to two seconds to exit; if it has not exited by then it is left
    running and a note is printed instead of its return code.
    """
    subject = "anycast_health_enforcer took emergency action on %s" % (HOSTNAME)
    recipients = args.email.split(",")
    command = ["mail", "-s", subject] + recipients
    print("Mailing: %s" % command)

    mailer = subprocess.Popen(command, shell=False, stdin=subprocess.PIPE)
    mailer.stdin.write(mesg)
    mailer.stdin.close()

    # Poll in 0.1 s steps until the mailer exits or the deadline passes.
    wait_time = 2.0
    deadline = time.time() + wait_time
    while True:
        if mailer.returncode is not None or time.time() >= deadline:
            break
        time.sleep(0.1)
        mailer.poll()

    if mailer.returncode is not None:
        print("Mailer subprocess returned: %s" % mailer.returncode)
    else:
        print("Mailer subprocess didn't exit after %s s! Abandoning." % wait_time)


def interface_updown_command(interface, up):
    """Return the command line that brings `interface` up (if `up` is truthy) or down."""
    program = "ifup" if up else "ifdown"
    return "%s %s" % (program, interface)


# 64-bit Linux kernel interface - man page is netdevice(7)

# ioctl request numbers.
SIOCGIFCONF = 0x8912
SIOCGIFFLAGS = 0x8913

# One SIOCGIFCONF record: 16-byte interface name, two unsigned shorts
# (address family, port), 4-byte address, then 16 bytes of padding.
SIOCGIFCONF_Data = collections.namedtuple(
    'SIOCGIFCONF_Data', 'ifr_name sa_family sin_port sin_addr')
SIOCGIFCONF_Packer = struct.Struct('16s H H 4s 16x')

# One SIOCGIFFLAGS record: 16-byte interface name, an unsigned short of
# flags, then 22 bytes of padding.
SIOCGIFFLAGS_Data = collections.namedtuple(
    'SIOCGIFFLAGS_Data', 'ifr_name ifr_flags')
SIOCGIFFLAGS_Packer = struct.Struct('16s H 22x')

# Flag bit tested in ifr_flags, and the sa_family value treated as IPv4.
IFF_UP = 1
AF_INET = 2

# Allocate a buffer large enough for 128 interfaces. This is huge. After the
# syscall, it should be nowhere near entirely used, and the used portion should
# be evenly divisible by the record size
BUF_RESERVED_ITEMS = 128

# Both record layouts must come out at 40 bytes.
assert SIOCGIFCONF_Packer.size == 40
assert SIOCGIFFLAGS_Packer.size == 40


def network_addresses():
    """
    List the addresses configured on this machine, via ioctl(SIOCGIFCONF).

    Returns a list of (ipaddress, interface) tuples, one per record reported
    by the ioctl. ipaddress is a dotted-quad string for AF_INET records and
    None for records of any other address family.

    Raises Exception if the ioctl output has an unexpected length, and
    AssertionError if the returned buffer description fails a sanity check.
    """
    ifreq_buffer = array.array('B', '\0' * SIOCGIFCONF_Packer.size * BUF_RESERVED_ITEMS)

    # The ioctl reads/writes a simple structure containing the length and
    # address of the ifreq buffer, into which interface information is written
    ifreq_addr, ifreq_len = ifreq_buffer.buffer_info()
    inbuf = struct.pack('IQ', ifreq_len, ifreq_addr)

    # The socket only provides a file descriptor for the ioctl; close it as
    # soon as the call returns so repeated polling does not accumulate
    # descriptors.
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        outbuf = fcntl.ioctl(s.fileno(), SIOCGIFCONF, inbuf)
    finally:
        s.close()

    if len(outbuf) != len(inbuf):
        raise Exception("ioctl(SIOCGIFCONF) returned output with len %s, expected %s"
                        % (len(outbuf), len(inbuf)))

    out_len, out_addr = struct.unpack('IQ', outbuf)
    # Sanity checks: same buffer, comfortably below capacity (so the list was
    # not cut short), and a whole number of records.
    assert out_addr == ifreq_addr
    assert out_len < ifreq_len - 2*SIOCGIFCONF_Packer.size
    assert (out_len % SIOCGIFCONF_Packer.size) == 0

    # Parse interfaces and addresses out of the buffer
    result = list()
    for i in range(0, out_len, SIOCGIFCONF_Packer.size):
        ifreq = SIOCGIFCONF_Data(*SIOCGIFCONF_Packer.unpack(ifreq_buffer[i:i+SIOCGIFCONF_Packer.size]))
        # The name field is NUL-padded to 16 bytes.
        interface = ifreq.ifr_name.rstrip('\0')
        if ifreq.sa_family == AF_INET:
            result.append((socket.inet_ntoa(ifreq.sin_addr), interface))
        else:
            result.append((None, interface))

    return result

def network_interface_is_up(interface):
    """Is a network interface up?

    Issues ioctl(SIOCGIFFLAGS) for `interface` and returns True when the
    IFF_UP bit is set in the flags that come back.

    Raises Exception if the ioctl returns a non-zero value; errors raised by
    the ioctl call itself propagate unchanged.
    """
    flags_in = SIOCGIFFLAGS_Data(ifr_name=interface, ifr_flags=0)
    # A mutable buffer, so the ioctl result can be read back out of `buf`.
    buf = array.array('B', SIOCGIFFLAGS_Packer.pack(*flags_in))

    # The socket only provides a file descriptor for the ioctl; close it as
    # soon as the call returns so repeated polling does not accumulate
    # descriptors.
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        r = fcntl.ioctl(s.fileno(), SIOCGIFFLAGS, buf)
    finally:
        s.close()

    if r != 0:
        raise Exception("ioctl(SIOCGIFFLAGS) returned non-zero: %s" % r)

    flags_out = SIOCGIFFLAGS_Data(*SIOCGIFFLAGS_Packer.unpack(buf))

    return (flags_out.ifr_flags & IFF_UP) == IFF_UP

def network_interface_has_address(interface):
    """Does a network interface have an IPv4 address?

    Returns True if network_addresses() lists at least one IPv4 address for
    `interface`, and False if it lists nothing for it. Raises Exception when
    the interface is listed but none of its addresses are IPv4.
    """
    # One entry per address reported for this interface: True when it is IPv4.
    is_ipv4 = [
        address is not None
        for address, name in network_addresses()
        if name == interface
    ]
    has_ipv4 = any(is_ipv4)

    if is_ipv4 and not has_ipv4:
        raise Exception("Interface has addresses, but they're all non-IPv4. I am confused!")

    return has_ipv4

def command_check(command, timeout):
    """Run a check command; raise an exception unless it exits 0 in time.

    Takes:
        command: command line string, split into arguments with shlex (no
            shell is involved)
        timeout: maximum number of seconds to wait for the command to exit,
            or None to wait indefinitely
    Returns True when the command exits with status 0.
    Raises Exception when the command exits non-zero, or when it is still
    running after `timeout` seconds (in which case it is killed first).
    """
    shell_args = shlex.split(command)
    proc = subprocess.Popen(shell_args)

    if timeout is None:
        ret = proc.wait()
    else:
        # Poll rather than block, so a hung check command cannot stall the
        # caller beyond the configured timeout.
        deadline = time.time() + timeout
        while proc.poll() is None:
            if time.time() >= deadline:
                proc.kill()
                proc.wait()  # reap the killed child
                raise Exception("Check command timed out after %s s" % timeout)
            time.sleep(0.05)
        ret = proc.returncode

    if ret == 0:
        return True
    raise Exception("Check command returned %i" % (ret))

def http_health_check(url, timeout):
    """Issue a HEAD request to `url`; return True if the response is acceptable.

    Any exception raised by the request itself (including exceeding `timeout`)
    or by raise_for_status() on the response propagates to the caller.
    """
    requests.head(url, timeout=timeout).raise_for_status()
    return True

def connect_check(url, timeout):
    """Raise an exception if we can't connect to the destination.

    Takes:
        url: destination in the form hostname:port (no scheme)
        timeout: connection timeout in seconds
    Returns True once a TCP connection has been opened and closed again.
    Raises Exception if `url` is not in hostname:port form; errors from
    parsing the port or from the connection attempt propagate unchanged.

    When --source is set, the connection is made from that source address.
    """
    hostdata = url.split(":")
    # Reject anything carrying a scheme ("http://host"), but do not reject
    # hostnames that merely begin with "http" (e.g. "httpd.example.com:80").
    if len(hostdata) != 2 or "://" in url:
        raise Exception("Check URL must be in format hostname:port.")

    host = hostdata[0]
    port = int(hostdata[1])

    if args.source:
        # Port 0 lets the OS pick the local port.
        source = (args.source, 0)
    else:
        source = None

    s = socket.create_connection((host, port), timeout, source)
    s.close()
    return True

def bring_interface_updown(status_message, interface, up):
    """Bring interface up/down with appropriate logging/emailing.

    Prints a description of the action. Unless --noop is set, it then emails
    the --email recipients (if any), reports CRIT to Nagios, and finally runs
    the ifup/ifdown command. Failures while alerting are printed and do not
    prevent the command from running.
    """
    command = interface_updown_command(interface, up)
    direction = "UP" if up else "DOWN"
    if args.noop:
        noop_note = "anycast_health_enforcer is in NO-OP mode, the command will not be run!"
    else:
        noop_note = ""

    mesg = """
anycast_health_enforcer is bringing anycast interface %s %s.

The current health status is: %s

More details may be available in the machine log.

%s

The command is: %s
""" % (interface, direction, status_message, noop_note, command)

    print(mesg)
    if args.noop:
        return

    try:
        if args.email:
            alert_email(mesg)
        send_nagios_passive_health(mesg, NAGIOS_CRIT)
    except Exception:
        traceback.print_exc()
    finally:
        # Run the command whether or not alerting succeeded.
        os.system(command)


# Rolling window of recent health-check results, appended to by poll_once():
# 1 for a successful check, 0 for a failed one. The deque keeps only the most
# recent --window entries.
history = collections.deque(maxlen=args.window)

def poll_once():
    """Run one health check and reconcile the interface state with it.

    Records the result of the configured check (raw command, connect-only, or
    HTTP) in `history`, and decides backend health from the number of
    successes in that window. It then inspects the interface. If backend
    health and interface state disagree, the interface is brought up or down
    to match the backend; otherwise the agreed state is reported to Nagios.
    """
    # Run whichever check was configured; any exception counts as a failure.
    try:
        if args.raw_command:
            command_check(args.raw_command, args.timeout)
        elif args.connect_only:
            connect_check(args.url, args.timeout)
        else:
            http_health_check(args.url, args.timeout)
    except Exception:
        traceback.print_exc()
        outcome = 0
    else:
        outcome = 1
    history.append(outcome)

    # Backend health over the window.
    ok_count = sum(history)
    http_healthy = ok_count >= args.threshold
    if http_healthy:
        http_template = "http healthy (%s/%s checks ok)"
    else:
        http_template = "http NOT healthy (%s/%s checks ok)"
    http_message = http_template % (ok_count, len(history))

    # Interface state: it must have an address and be up.
    try:
        if not network_interface_has_address(args.interface):
            network_healthy = False
            network_message = "interface does not exist, or lacks address"
        elif not network_interface_is_up(args.interface):
            network_healthy = False
            network_message = "interface exists but is down"
        else:
            network_healthy = True
            network_message = "interface is up and has an address"
    except Exception:
        traceback.print_exc()
        network_healthy = False
        network_message = "interface checking raised an exception"

    status_message = "%s. %s." % (http_message, network_message)

    if http_healthy != network_healthy:
        # Backend and interface disagree: make the interface follow the backend.
        print(status_message)
        bring_interface_updown(status_message, args.interface, http_healthy)
    else:
        # Both agree: OK when both are healthy, CRIT when both are unhealthy.
        agreed_status = NAGIOS_OK if http_healthy else NAGIOS_CRIT
        send_nagios_passive_health(status_message, agreed_status)

    sys.stdout.flush()


if __name__ == '__main__':
    print("anycast_health_enforcer daemon is starting")

    # There is a way to have "requests" use a source address, but the version
    # we use doesn't support adapters, which is what's required to use that:
    # https://github.com/sigmavirus24/requests-toolbelt/blob/master/requests_toolbelt/adapters/source.py
    if args.source and not args.connect_only:
        raise Exception("--source can only be used with --connect-only.")

    # --url is ignored by the raw command check, so reject the combination.
    if args.raw_command and args.url != DEFAULT_URL:
        raise Exception("--raw-command and --url cannot be used together.")

    while True:
        poll_once()
        if args.once:
            # Single-shot mode: exit straight away instead of sleeping first.
            break
        time.sleep(args.interval)
