#!/usr/bin/env python
"""Multiple Nagios check for haproxy health based on haproxy stats"""
import argparse
import csv
import socket
import glob
import json
import sys
import time
import errno
from StringIO import StringIO

# Nagios exit states as (process exit code, label) pairs; exit() prints the
# label as the prefix of the status line and exits with the code.
OK = (0, 'OK')
WARN = (1, 'WARN')
CRITICAL = (2, 'CRITICAL')
UNKNOWN = (3, 'UNKNOWN')

# Default thresholds for the current/limit ratio checks; overridable from the
# command line with --warn_ratio / --crit_ratio.
WARN_RATIO = 0.75
CRIT_RATIO = 0.9

# Largest tolerated per-second gap between backend-level and server-level 5xx
# counter growth in check_server_noresponse.
MAX_DIFF = 0.1
# Saved counter state older than this many seconds is ignored.
MAX_AGE = 300
# Prefix (note the trailing slash) of the per-process JSON state files.
COMPARE_STATE_DIR = '/tmp/'

class HaproxyStatsMap(object):
    """Snapshot of `show stat` / `show info` output from haproxy stats sockets.

    Attributes:
        sock_dir: prefix that was prepended to every glob pattern.
        global_stat_dict_list: dict mapping socket name (socket path with the
            leading ``sock_dir`` removed) to the list of row dicts parsed from
            that socket's `show stat` CSV output.
        global_info_dict_list: dict mapping socket name to the key/value dict
            parsed from that socket's `show info` output.
    """

    # Number of bytes requested per recv() call.
    SOCKET_BUFFER_SIZE = 1024

    def __init__(self, sock_dir, file_names):
        """Query every socket matching ``sock_dir + pattern``.

        Args:
            sock_dir: path prefix of the sockets; it is concatenated with each
                pattern as-is, so it should end with a path separator.
            file_names: iterable of glob patterns relative to ``sock_dir``.

        Raises:
            ValueError: no socket matched, or no socket returned any data.
            IOError: at least one socket could not be queried.
        """
        self.sock_dir = sock_dir

        sockets = []
        for file_name in file_names:
            sockets += glob.glob(sock_dir + file_name)
        if not sockets:
            raise ValueError("no socket files in passed in dir:{}".format(sock_dir))

        self.global_stat_dict_list, stat_err = self._show_stat(sockets)
        if not self.global_stat_dict_list:
            raise ValueError("no stats output from any sockets in passed in dir:{}".format(sock_dir))
        if stat_err:
            raise IOError(stat_err)

        self.global_info_dict_list, info_err = self._show_info(sockets)
        # Validate the info result itself (not the stat result a second time).
        if not self.global_info_dict_list:
            raise ValueError("no info output from any sockets in passed in dir:{}".format(sock_dir))
        if info_err:
            raise IOError(info_err)

    def _show_stat(self, sockets):
        """Run `show stat` on each socket.

        Returns:
            (dict of socket name -> list of row dicts, concatenated error text)
        """
        return self._collect("show stat -1 -1 -1", sockets, self._parse_stat)

    def _show_info(self, sockets):
        """Run `show info` on each socket.

        Returns:
            (dict of socket name -> info dict, concatenated error text)
        """
        return self._collect("show info", sockets, self._parse_info)

    def _collect(self, cmd, sockets, parse):
        """Send ``cmd`` to every socket and parse each non-empty reply.

        A socket that fails contributes one ``path:error`` line to the error
        text and is left out of the result; it never inherits the reply of a
        previously queried socket.
        """
        results = {}
        errors = []
        for sock_path in sockets:
            try:
                raw = self._get_data(cmd, sock_path)
            except IOError as e:
                errors.append("{}:{}\n".format(sock_path, str(e)))
                continue

            if raw:
                results[sock_path.replace(self.sock_dir, "", 1)] = parse(raw)

        return results, "".join(errors)

    def _get_data(self, cmd, sock_path):
        """Send one command over the unix socket and read the reply until EOF.

        Returns:
            The reply text, or None when the exchange was interrupted by
            EAGAIN/EINTR (treated as "no data" by the callers).

        Raises:
            IOError: any other socket error.
        """
        client = None
        try:
            client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            client.connect(sock_path)
            client.sendall('{}\n'.format(cmd))

            chunks = []
            while True:
                chunk = client.recv(self.SOCKET_BUFFER_SIZE)
                if not chunk:
                    break
                chunks.append(chunk)

            return ''.join(chunks)
        except IOError as e:
            if e.errno not in (errno.EAGAIN, errno.EINTR):
                raise
            return None
        finally:
            # client stays None when socket creation itself failed.
            if client is not None:
                client.close()

    @staticmethod
    def _parse_stat(stat):
        """Parse `show stat` CSV text into a list of dicts keyed by the header row."""
        # keepends=True hands the csv reader the same lines a file object would.
        return list(csv.DictReader(stat.splitlines(True)))

    @staticmethod
    def _parse_info(info):
        """Parse `show info` "key: value" lines into a dict.

        Lines without a colon are skipped. Only the first colon separates key
        from value, so values may themselves contain colons, and a key with no
        value maps to an empty string.
        """
        parsed = {}
        for line in info.split('\n'):
            key, sep, value = line.partition(':')
            if not sep:
                continue
            parsed[key.strip()] = value.strip()
        return parsed

class Fails(object):
    """Collects check failures per haproxy process and tracks the worst level.

    Attributes:
        fails: dict mapping process name to the list of failure dicts.
        exit_code: OK until a failure with level 'warn' (-> WARN) or
            'crit' (-> CRITICAL) is appended; CRITICAL is never downgraded.
    """

    def __init__(self):
        self.fails = {}
        self.exit_code = OK

    def output(self):
        """Return all recorded failures as an indented JSON string."""
        return json.dumps(self.fails, indent=4)

    def append(self, process_name, fails=None):
        """Record ``fails`` (an iterable of failure dicts) for ``process_name``.

        Failures whose 'level' is neither 'warn' nor 'crit' are recorded but
        do not change the exit code.
        """
        for fail in fails or ():
            self.fails.setdefault(process_name, []).append(fail)

            level = fail.get('level', '')
            if level == 'crit':
                self.exit_code = CRITICAL
            elif level == 'warn' and self.exit_code != CRITICAL:
                self.exit_code = WARN

def new_fail(check, pxname, svname, info=None):
    """Build the failure record reported for one failed check.

    Args:
        check: human-readable name of the check.
        pxname: stored under 'instance name'.
        svname: stored under 'instance type'.
        info: optional dict of extra fields (e.g. 'level') merged into the
            record; its keys override the three base keys on collision.

    Returns:
        A new dict; ``info`` itself is never modified.
    """
    res = {'check_item': check,
           'instance name': pxname,
           'instance type': svname}
    if info:
        res.update(info)

    return res

# Session usage check for FRONTEND rows of `show stat`: compares scur/slim
# (the limit is defined in the defaults or frontend section) with the thresholds.
def check_instance_session_ratio(stats_map, **kwargs):
    """Return a failure record when a frontend's session ratio is too high.

    Args:
        stats_map: one `show stat` row; rows whose 'svname' is not 'FRONTEND'
            are ignored.
        **kwargs: optional 'warn_ratio' / 'crit_ratio' overriding WARN_RATIO
            and CRIT_RATIO.

    Returns:
        {} when the row is skipped or below the warn threshold, otherwise the
        dict built by new_fail() with level 'warn' or 'crit'.
    """
    if stats_map['svname'] != 'FRONTEND':
        return {}

    warn_ratio = kwargs.get('warn_ratio', WARN_RATIO)
    crit_ratio = kwargs.get('crit_ratio', CRIT_RATIO)

    ratio = float(stats_map['scur'])/float(stats_map['slim'])
    if ratio >= crit_ratio:
        level = 'crit'
    elif ratio >= warn_ratio:
        level = 'warn'
    else:
        return {}

    details = {
        'level': level,
        'max_sess': stats_map['slim'],
        'cur_sess': stats_map['scur'],
        'cur_ratio': ratio,
        'warn_ratio': warn_ratio,
        'crit_ratio': crit_ratio,
    }
    return new_fail('check instance session ratio', stats_map['# pxname'], stats_map['svname'], info=details)

# Connection usage check for one haproxy process: compares CurrConns/Maxconn
# from `show info` (the limit is defined in the global section) with the thresholds.
def check_process_conn_ratio(info_map, **kwargs):
    """Return a failure record when the process connection ratio is too high.

    Args:
        info_map: parsed `show info` dict; reads 'CurrConns', 'Maxconn' and,
            when reporting, 'node'.
        **kwargs: optional 'warn_ratio' / 'crit_ratio' overriding WARN_RATIO
            and CRIT_RATIO.

    Returns:
        {} below the warn threshold, otherwise the dict built by new_fail()
        with level 'warn' or 'crit'.
    """
    warn_ratio = kwargs.get('warn_ratio', WARN_RATIO)
    crit_ratio = kwargs.get('crit_ratio', CRIT_RATIO)

    cur_conn = int(info_map['CurrConns'])
    max_conn = int(info_map['Maxconn'])
    conn_ratio = float(cur_conn) / max_conn

    if conn_ratio >= crit_ratio:
        level = 'crit'
    elif conn_ratio >= warn_ratio:
        level = 'warn'
    else:
        return {}

    details = {
        'level': level,
        'MaxConn': max_conn,
        'CurrConn': cur_conn,
        'CurrRatio': conn_ratio,
        'WarnRatio': warn_ratio,
        'CritRatio': crit_ratio,
    }
    return new_fail('check process connection ratio', info_map['node'], "INSTANCE", info=details)

# Check the number of usable (active + backup) servers behind each BACKEND row
def check_available_backend(stats_map, **kwargs):
    """Return a failure record when a backend is short of usable servers.

    Args:
        stats_map: one `show stat` row; rows whose 'svname' is not 'BACKEND'
            are ignored. 'act' and 'bck' arrive as CSV strings and are
            converted to integers before comparing (an empty value counts
            as 0).

    Returns:
        {} when at least one active server exists; a 'warn' record when only
        backup servers remain; a 'crit' record when there are none at all.
    """
    if stats_map['svname'] != 'BACKEND':
        return {}

    # The CSV values are text: without int() '0' + '0' is '00', never 0.
    active = int(stats_map['act'] or 0)
    backup = int(stats_map['bck'] or 0)

    if active + backup == 0:
        level = 'crit'
    elif active == 0:
        level = 'warn'
    else:
        return {}

    extra_info = {
        'level': level,
        'active_backend': active,
        'backup_backend': backup,
    }
    return new_fail('check available backend', stats_map['# pxname'], stats_map['svname'], info=extra_info)

# Report every row whose status is not UP (server, backend), OPEN (frontend)
# or 'no check'
def check_instance_status(stats_map, **kwargs):
    """Return a 'crit' failure record for a row with an unhealthy status.

    Args:
        stats_map: one `show stat` row; reads 'status' and, when reporting,
            '# pxname' and 'svname'.

    Returns:
        {} when 'status' is exactly 'UP', 'OPEN' or 'no check'; otherwise the
        dict built by new_fail() carrying the offending status.
    """
    status = stats_map['status']
    if status in ('UP', 'OPEN', 'no check'):
        return {}

    details = {'level': 'crit', 'status': status}
    return new_fail('check status', stats_map['# pxname'], stats_map['svname'], info=details)

# Check if any backend instance return 5xx without server interaction (indicates server resource starvation)
def check_server_noresponse(process_name, stats_map_list, **kwargs):
    """Flag proxies whose backend 5xx counter outgrows their servers' counters.

    For every proxy (rows named 'stats-*' and FRONTEND rows are skipped) the
    'hrsp_5xx' counters of the BACKEND row and of the server rows are summed
    separately and compared with the values saved by the previous run in
    ``COMPARE_STATE_DIR + process_name + ".json"``. A proxy fails when the
    absolute difference between backend growth and server growth, divided by
    the seconds elapsed between the two runs, reaches MAX_DIFF. Saved state
    older than MAX_AGE seconds is ignored. The current counters are then
    written back for the next run.

    Diagnostics go to stderr so that stdout carries only the final status
    line. A missing, unreadable or corrupt state file is treated as "no
    previous state".

    Args:
        process_name: name of the haproxy process (used for the state file).
        stats_map_list: list of `show stat` row dicts of that process.

    Returns:
        List of failure dicts built by new_fail(); includes a 'warn' entry
        when the state file could not be written.
    """
    def delta(curr, prev, key):
        # Growth of one counter since the previous run.
        return curr.get(key, 0) - prev.get(key, 0)

    def elapsed(curr, prev):
        # Seconds between the two samples; 0 when prev carries no timestamp.
        stamp = curr.get("timestamp", 0)
        return stamp - prev.get("timestamp", stamp)

    def compare(curr, prev=None):
        # True means "healthy" (or nothing to compare against).
        if prev is None:
            return True

        diff_time = elapsed(curr, prev)
        if diff_time == 0:
            return True

        gap = abs(delta(curr, prev, "backend_5xx") - delta(curr, prev, "server_5xx"))
        return float(gap) / diff_time < MAX_DIFF

    def output_info(curr, prev):
        return {'backend_5xx': delta(curr, prev, "backend_5xx"),
                'server_5xx': delta(curr, prev, "server_5xx"),
                'time': elapsed(curr, prev),
                'level': "crit"}

    # Sum the 5xx counters per proxy, split into backend-level and server-level.
    map_5xx = {}
    for stats_map in stats_map_list:
        pxname = stats_map['# pxname']
        if pxname.startswith('stats-'):
            continue
        svname = stats_map['svname']
        if svname == 'FRONTEND':
            continue
        if not stats_map.get('hrsp_5xx'):
            continue

        key = 'backend_5xx' if svname == 'BACKEND' else 'server_5xx'
        entry = map_5xx.get(pxname)
        if entry is None:
            entry = map_5xx[pxname] = {'timestamp': time.time()}
        entry[key] = entry.get(key, 0) + int(stats_map['hrsp_5xx'])

    file_path = COMPARE_STATE_DIR+process_name+".json"
    prev_map_5xx = {}
    try:
        with open(file_path) as f:
            prev_map_5xx = json.load(f)
    except (IOError, ValueError) as e:
        # Expected on the first run; a corrupt file must not abort the check.
        sys.stderr.write("{}\n".format(e))
    if not isinstance(prev_map_5xx, dict):
        prev_map_5xx = {}

    fails = []
    for pxname, curr in map_5xx.items():
        prev = prev_map_5xx.get(pxname)
        if not isinstance(prev, dict) or time.time()-prev.get("timestamp", 0) > MAX_AGE:
            prev = None
        if not compare(curr, prev):
            fails.append(new_fail('check server no-response per sec', pxname, "SERVER/BACKEND", output_info(curr, prev)))

    try:
        with open(file_path, mode='w') as f:
            json.dump(map_5xx, f)
    except IOError as e:
        sys.stderr.write("{}\n".format(e))
        fails.append(new_fail('check server no-response per sec', 'overall', 'SERVER/BACKEND', {'level': 'warn', 'reason': str(e)}))

    return fails

def check_per_instance(stats_map_list, check_list=None, **kwargs):
    """Run every check against every `show stat` row and collect the failures.

    Rows whose '# pxname' starts with 'stats-' are skipped. Each check is
    called as ``check(row, **kwargs)``; falsy results are dropped.

    Returns:
        List of truthy check results, ordered by row and then by check; an
        empty list when ``check_list`` is None.
    """
    if check_list is None:
        return []

    outcomes = (
        check(row, **kwargs)
        for row in stats_map_list
        if not row['# pxname'].startswith('stats-')
        for check in check_list
    )
    return [outcome for outcome in outcomes if outcome]

def check_per_process(info_map, check_list=None, **kwargs):
    """Run every check against one process' `show info` dict.

    Each check is called as ``check(info_map, **kwargs)``; falsy results are
    dropped.

    Returns:
        List of truthy check results in ``check_list`` order; an empty list
        when ``check_list`` is None.
    """
    if check_list is None:
        return []

    outcomes = (check(info_map, **kwargs) for check in check_list)
    return [outcome for outcome in outcomes if outcome]

# Maps the check names accepted by --check_list to the functions implementing them.
CHECK_MAP = {
    'available-backend': check_available_backend,
    'sess-ratio': check_instance_session_ratio,
    'conn-ratio': check_process_conn_ratio,
    'instance-status': check_instance_status,
    'server-noresp': check_server_noresponse,
}

# These are the checks against a single row (one instance) in `show stat` csv output; within single haproxy process context
INSTANCE_STAT_CHECK = [check_available_backend, check_instance_session_ratio, check_instance_status]
# These are the checks against multiple rows (involve multiple instances) in `show stat` csv output; within single haproxy process context
PROCESS_STAT_CHECK = [check_server_noresponse]
# These are the checks against a single haproxy process' `show info` output (parsed "key: value" lines)
PROCESS_INFO_CHECK = [check_process_conn_ratio]

def perform_all_checks(haproxy_stats, check_list=None, **kwargs):
    """Run the selected checks over every haproxy process and gather failures.

    Args:
        haproxy_stats: object exposing ``global_stat_dict_list`` and
            ``global_info_dict_list`` (dicts keyed by process name).
        check_list: check functions to run; each is dispatched according to
            its membership in INSTANCE_STAT_CHECK, PROCESS_STAT_CHECK or
            PROCESS_INFO_CHECK. None runs nothing.
        **kwargs: forwarded unchanged to every check.

    Returns:
        A Fails instance holding the failures per process name.
    """
    final_fails = Fails()
    if check_list is None:
        return final_fails

    # Split the requested checks by the kind of input they consume.
    instance_checks = [c for c in check_list if c in INSTANCE_STAT_CHECK]
    process_stat_checks = [c for c in check_list if c in PROCESS_STAT_CHECK]
    process_info_checks = [c for c in check_list if c in PROCESS_INFO_CHECK]

    for process_name, stats_map in haproxy_stats.global_stat_dict_list.iteritems():
        # Row-by-row checks on the `show stat` result come first ...
        row_fails = check_per_instance(stats_map, check_list=instance_checks, **kwargs)
        if row_fails:
            final_fails.append(process_name, fails=row_fails)

        # ... followed by the checks that need all rows of the process at once.
        for check in process_stat_checks:
            multi_row_fails = check(process_name, stats_map, **kwargs)
            if multi_row_fails:
                final_fails.append(process_name, fails=multi_row_fails)

    for process_name, info_map in haproxy_stats.global_info_dict_list.iteritems():
        # Checks on the `show info` output of a single haproxy process.
        info_fails = check_per_process(info_map, check_list=process_info_checks, **kwargs)
        if info_fails:
            final_fails.append(process_name, fails=info_fails)

    return final_fails

def exit(err, msg):
    """Print the status line ``<label>: <msg>`` and terminate the process.

    Args:
        err: (exit code, label) pair such as OK or CRITICAL.
        msg: text (or any object) rendered after the label.
    """
    label = err[1]
    sys.stdout.write('{}: {}\n'.format(label, msg))
    sys.exit(err[0])

def main():
    """Parse the command line, run the requested checks and exit Nagios-style.

    Exit states: OK when every check passed; WARN/CRITICAL according to the
    worst recorded failure; UNKNOWN for unrecognised check names or when no
    usable socket/stats were found (ValueError); CRITICAL for any other error.
    """
    parser = argparse.ArgumentParser(description='give me the sock_dir and check list')
    parser.add_argument('-c', '--check_list', nargs='+', help='<Required> define what checks to run', required=True)
    parser.add_argument('-g', '--glob', nargs='+', help='<Required> what haproxy stat socks to run the check', required=True)
    parser.add_argument('-s', '--socket_dir', help='<Required> haproxy sockets dir', required=True)
    # %(default)s keeps the help text in sync with WARN_RATIO / CRIT_RATIO.
    parser.add_argument('-w', '--warn_ratio', type=float, default=WARN_RATIO,
                        help='the warning threshold of connection ratio; default is %(default)s')
    parser.add_argument('-t', '--crit_ratio', type=float, default=CRIT_RATIO,
                        help='the critical threshold of connection ratio; default is %(default)s')
    args = parser.parse_args()

    # A mistyped check name must not be dropped silently: that would report
    # "all passed" for a check that never ran.
    unknown_checks = [name for name in args.check_list if name not in CHECK_MAP]
    if unknown_checks:
        exit(UNKNOWN, "unknown check(s) {}; valid checks are {}".format(unknown_checks, sorted(CHECK_MAP)))

    check_list = [CHECK_MAP[name] for name in args.check_list]

    try:
        haproxy_stats = HaproxyStatsMap(args.socket_dir, args.glob)
        final_fails = perform_all_checks(haproxy_stats, check_list=check_list, warn_ratio=args.warn_ratio, crit_ratio=args.crit_ratio)
        if final_fails.fails:
            exit(final_fails.exit_code, final_fails.output())
        exit(OK, "performed {}; all passed".format(args.check_list))
    except ValueError as err:
        exit(UNKNOWN, err)
    except Exception as err:
        exit(CRITICAL, err)

if __name__ == '__main__':
    main()
