#!/usr/bin/perl -w

use strict;
use Sys::Hostname;

# Static configuration of the check; everything below reads its settings
# from this structure.
my $conf = {
    # Alert limits. A brick triggers the alert when its free amount is
    # less than or equal to the limit.
    thresh => {
        ### set thresholds here ###
        ###########################
        # Free disk space, as a number followed by a size unit.
        crit_space  => '2GB',
        warn_space  => '5GB',
        # Free inode count.
        crit_inodes => '1000',
        warn_inodes => '10000',
        ###########################
    },

    # External commands.
    cmds => {
        ### full gluster CLI command line (run through sudo) whose output is parsed ###
        gfs_tool => '/usr/bin/sudo -n /usr/sbin/gluster volume status all detail',
    },

    # Processes that must be running: exact process name => label used
    # in the alert message.
    procs => {
        ### list of processes to monitor ###
        glusterd   => 'gfs mngmnt daemon',
        glusterfsd => 'gfs brick daemon',
        glusterfs  => 'gfs export daemon',
    },

    # How many times the gluster CLI is tried before giving up.
    cli_max_attempts    => 10,
    # Upper bound, in seconds, of the random pause taken after a failed try.
    cli_attempt_timeout => 180,
};

# Print the check result as a single "<status_code>;<message>" line and
# end the script.
#
# Arguments:
#   $status_code - status value placed before the semicolon
#   $message     - free-form text placed after the semicolon
#
# Never returns. The process exit code is always 0; the outcome of the
# check is carried only by the printed line.
sub terminate
{
    my $status_code = shift;
    my $message     = shift;

    print join( ';', $status_code, $message ) . "\n";
    exit 0;
}

# Convert a human-readable size such as "4.6TB" or "512Bytes" to bytes.
#
# Arguments:
#   $input - a non-negative decimal number, optional whitespace and an
#            optional unit suffix (case-insensitive): PB, TB, GB, MB, KB,
#            Bytes, Byte or B. A number without a suffix is taken as a
#            byte count.
#
# Returns:
#   The size in bytes as a number, or undef (an empty list in list
#   context) when $input is undefined or is not a recognisable size.
sub to_bytes
{
    my $input = shift;

    # Nothing to parse; bail out before the match to avoid
    # "uninitialized value" warnings.
    return unless defined $input;

    my %multiplier_for = (
        PB    => 1024 ** 5,
        TB    => 1024 ** 4,
        GB    => 1024 ** 3,
        MB    => 1024 ** 2,
        KB    => 1024,
        BYTES => 1,
        BYTE  => 1,
        B     => 1,
        ''    => 1,    # no suffix at all: plain byte count
    );

    # A single anchored match splits the value into number and suffix;
    # the suffix is then looked up in the table, so an unknown unit is
    # rejected instead of silently being treated as zero.
    return
        unless $input =~ /^\s*([0-9]+(?:\.[0-9]*)?|\.[0-9]+)\s*([a-zA-Z]*)\s*$/;
    my ( $number, $unit ) = ( $1, uc $2 );

    return unless exists $multiplier_for{$unit};
    return $multiplier_for{$unit} * $number;
}

# Accumulated alert text, one string per severity. Both start empty and
# get "<problem>; " fragments appended as problems are found.
my $status = {};
@{$status}{qw(warn crit)} = ( '', '' );


# 1. Quick glance to see if all necessary daemons are running.
# pgrep -x matches the process name exactly; when it prints nothing the
# daemon is reported as a critical problem under its configured label.
for my $proc_name ( keys %{ $conf->{procs} } ) {
    my $found_pids = `/usr/bin/pgrep -x $proc_name`;
    next if $found_pids;

    $status->{crit} .= "$conf->{procs}->{$proc_name} is not running!; ";
}

# 2. Check bricks' status more thoroughly

# Collect all statuses into a hierarchical structure, mainly to be able
# to print errors more informatively (e.g., reference a brick id in error msg.)
my @peer_status;    # one hash of "field name => value" per reported brick
my @peers;          # raw text sections of the tool output

# Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=981661
# The tool is re-run when it finishes with a non-zero status, up to
# cli_max_attempts times, with a random pause of 1..cli_attempt_timeout
# seconds after each failed attempt except the last one.
ATTEMPT: for my $attempt ( 1 .. $conf->{cli_max_attempts} ) {
    # Three-argument pipe open; the command string still goes through the
    # shell, which is what applies the stderr redirection.
    open( my $tool_output, '-|', $conf->{cmds}->{gfs_tool} . ' 2>/dev/null' )
        or terminate(2, "gluster tool failed to run");
    my $raw_output = do { local $/; <$tool_output> };

    # Closing a pipe handle waits for the child and leaves its wait
    # status in $?.
    close $tool_output;
    my $wait_status = $?;

    # Sections of the report are separated by runs of dashes.
    @peers = split /-{2,}/, ( defined $raw_output ? $raw_output : '' );

    # See if tool ran ok
    last ATTEMPT if $wait_status == 0;

    # Out of attempts: report right away instead of sleeping first.
    if ( $attempt >= $conf->{cli_max_attempts} ) {
        terminate(1, "gluster tool failed to run after $conf->{cli_max_attempts} attempts.");
    }

    sleep 1 + int( rand( $conf->{cli_attempt_timeout} ) );
}

# Turn every text section of the tool output into a hash of its
# "name : value" lines and keep the sections that describe a brick.
PEER: foreach my $section (@peers) {
    # $section = e.g.:
    # Brick                : Brick logstore09h.mail.yandex.net:/data_1
    # Port                 : 49157
    # Online               : Y
    my %field_of;

    PARAM: foreach my $line ( split "\n", $section ) {
        # $line = e.g.:
        # Online               : Y
        # The value must be a single whitespace-free token; a leading
        # "Brick " word in the value is dropped.
        next PARAM
            unless $line =~ /(^[a-zA-Z0-9\s-]+[a-zA-Z0-9])\s+:\s+(?:Brick\s+)?([^\s]+)\s*$/;

        $field_of{$1} = $2;
    } # end for PARAM

    # Sections without a Brick field are not brick descriptions.
    next PEER unless $field_of{Brick};

    push @peer_status, {%field_of};
} # end for PEER

# Perform actual checks.
# Each check walks the severities from most to least severe and records
# the problem under the first one whose threshold is reached.
BRICK: foreach my $brick (@peer_status) {
    my $brick_id = $brick->{Brick};

    # 2.0. Skip all bricks but mine.
    # The brick id is "<host>:<path>"; only the host part is compared.
    my ($brick_host) = split( ':', $brick_id );
    next BRICK unless hostname() eq $brick_host;

    # 2.1. Check if brick is online.
    unless ( $brick->{Online} eq 'Y' ) {
        $status->{crit} .= "brick ${brick_id} is offline; ";
        # If its dead, dont bother checking for free space and stuff.
        next BRICK;
    }

    # 2.2. Check if it has enough free space left
    # e.g.: Disk Space Free: 4.6TB
    my $space_left = $brick->{'Disk Space Free'};
    SPACE_LEVEL: for my $level (qw(crit warn)) {
        my $limit = $conf->{thresh}->{ $level . '_space' };
        if ( to_bytes($space_left) <= to_bytes($limit) ) {
            $status->{$level} .= "no dskspace on ${brick_id}: ${space_left} left; ";
            last SPACE_LEVEL;
        }
    }

    # 2.3. Check if it has enough free inodes
    # e.g.: Free Inodes: 1403068771
    my $inodes_left = $brick->{'Free Inodes'};
    INODE_LEVEL: for my $level (qw(crit warn)) {
        my $limit = $conf->{thresh}->{ $level . '_inodes' };
        if ( $inodes_left <= $limit ) {
            $status->{$level} .= "inodes cnt on ${brick_id}: ${inodes_left} left; ";
            last INODE_LEVEL;
        }
    }
}
# 3. TODO: add heal checks. Not relevant in our case, though.
# 4. Final verdict.
# Critical problems win over warnings (the warning text is appended to
# the critical one); with neither, the check is OK.
my ( $crit_text, $warn_text ) = @{$status}{qw(crit warn)};

if ($crit_text) {
    terminate( 2, $crit_text . $warn_text );
}
elsif ($warn_text) {
    terminate( 1, $warn_text );
}
else {
    terminate( 0, 'OK' );
}
