#!/usr/bin/perl

=head1 NAME
       
    apache-mem-monitor - мониторинг и графики памяти директовых апачей

=head1 DESCRIPTION

    apache-mem-monitor.pl [opts] {monrun|graphite|restart}

    graphite - данные о памяти воркеров в формате
        graphite-client (по умолчанию)
    monrun - мониторинг памяти воркеров в monrun формате
    restart - рестартует апачи, если превышен warn limit, не больше одного ДЦ
        в день. Без опции --force просто говорит, что собирается сделать.

    Опции:
    --rss-limit-kb - лимит памяти на воркер (как в конфиге), в KБ
        если не указано, пытается найти в директовых Settings
    --warn - лимит (по умолчанию 0.75), при котором зажигать мониторинг в warning
    --crit - лимит (по умолчанию 0.9), при котором зажигать мониторинг в crit
    --force - на самом деле сделать restart
    --force-dc - игнорировать правило "не больше одного дц в день", рестартить
        указанный дц (man, sas, ...). --force-dc=any рестартит любой дц.
    --debug

    Примеры:

    apache-mem-monitor.pl # список метрик для графита
    apache-mem-monitor.pl monrun

    apache-mem-monitor.pl restart --force-dc=man
    apache-mem-monitor.pl restart --force-dc=any
    apache-mem-monitor.pl restart --force
    apache-mem-monitor.pl restart --debug

=cut

use strict;
use warnings FATAL => 'all';
use utf8;
use open qw(:std :encoding(utf8));
use Getopt::Long;

# Options hash placeholder; nothing in this script reads it at the moment.
my %O;

# Host part of the graphite metric path: the FQDN with every character
# outside [0-9a-zA-Z-] (dots included) replaced by "_".
my $host = qx(hostname -f); chomp $host; $host =~ s/[^0-9a-zA-Z-]/_/g;

# Fractions of the rss limit at which monitoring turns warning / critical.
my $threshold_warn = 0.75;
my $threshold_crit = 0.90;
my $debug;
my $force;
my $force_dc;
my $rss_limit;

GetOptions(
    '--rss-limit-kb=i' => \$rss_limit,
    '--warn=f' => \$threshold_warn,
    '--crit=f' => \$threshold_crit,
    '--debug' => \$debug,
    '--force' => \$force,
    '--force-dc=s' => \$force_dc,
) or die "Bad command line options, see: perldoc $0\n";

# The first positional argument selects the mode; graphite is the default.
my $action = shift // 'graphite';
log_debug("force:" . ($force // "undef") . " force_dc:" . ($force_dc // "undef"));

# The warn threshold can never be above the crit one.
$threshold_warn = $threshold_crit if $threshold_warn > $threshold_crit;
# Scheduled restarts (on the datacenter's own day) start at a third of warn.
my $threshold_restart = $threshold_warn / 3;
log_debug(sprintf "warn: %.2f, crit: %.2f, restart: %.2f", ($threshold_warn, $threshold_crit, $threshold_restart));

# The match is anchored: an unanchored one accepted typos such as "restartx"
# or "monrun2", after which no branch ran and the script exited silently.
if ($action !~ /\A(?:monrun|graphite|restart)\z/) {
    system("podselect -section NAME -section DESCRIPTION $0 | pod2text-utf8 >&2");
    exit(0);
}

# Config prefixes of the apaches that are over the warn / crit threshold;
# they end up in the monrun status line.
my (@errmsg_warn, @errmsg_crit);

# Worker counters summed over all apaches: in state R, and in any state.
my $total_state_run = 0;
my $total_state_all = 0;

# Current "HH:MM:" preceded by a word-boundary anchor.
chomp(my $time_regex = '\b' . qx(date +%H:%M:));

# Find out which apaches are running and with which configs: the last word
# of every matching command line, de-duplicated.
my $cmd = q(ps -e -o pid,cmd | grep 'apache[2] -f /etc' | perl -lane 'print $F[-1]' | sort -u);
my @apache_configs = split /\n/, qx($cmd);
log_debug("apache configs: @apache_configs");

# Per-apache results keyed by config prefix:
#   graphite           - ready-to-print metric lines
#   restart/warn/crit  - 0/1 flags, which thresholds the root process exceeds
my %apache;

# A limit given with --rss-limit-kb applies to every apache. Without it the
# limit is looked up separately for each config: get_direct_rss_limit() picks
# a different setting for soap, so the value found for one config must not be
# reused for the others.
my $rss_limit_cli = $rss_limit;
my $rss_limit_found = 0;

# Collect memory info about the workers of each kind of apache, check the
# limits and store the result in $apache{$conf_prefix}.
for my $conf (@apache_configs) {
    my ($conf_prefix) = ($conf =~ m!/([^/]+)$!);
    # A config path without a slash: use it as is instead of dying on undef.
    $conf_prefix //= $conf;
    $conf_prefix =~ s/\.conf//;
    $conf_prefix =~ s/\.yandex\.\w+$//;
    # conf_prefix - ppc|soap.direct|...
    $conf_prefix =~ s/[^0-9a-zA-Z-]/_/g;

    my ($root_pid, $root_vsize, $root_rss, $workers_avg_vsize, $workers_avg_rss, $state_run, $state_all) = (0) x 7;

    # Build the command once so that the debug line shows exactly what is run.
    my $ps_cmd = "ps -eL -o pid,ppid,vsize,rss,state,cmd | grep apache2 | grep -v grep | grep $conf";
    log_debug("find apache root process for $conf: $ps_cmd");
    for my $ps_info (split /\n/, qx($ps_cmd)) {
        # /g strips both ends; without it only the first match was removed.
        $ps_info =~ s/^\s+|\s+$//g;
        my ($pid, $ppid, $vsize, $rss, $state) = split /\s+/, $ps_info;
        # Skip anything that does not look like a ps row.
        next unless defined $state
            and $ppid =~ /^\d+$/ and $vsize =~ /^\d+$/ and $rss =~ /^\d+$/;

        if ($ppid == 1) {
            # The process whose parent is pid 1 is the apache root.
            ($root_pid, $root_vsize, $root_rss) = ($pid, $vsize, $rss);
        }
        else {
            $workers_avg_vsize += $vsize;
            $workers_avg_rss += $rss;
            $state_all++;
            $total_state_all++;
            if ($state =~ /^R$/) {
                $state_run++;
                $total_state_run++;
            }
        }
    }
    # Average over the worker rows only; the root row is not a worker and
    # its memory is not part of the sums.
    $workers_avg_vsize = int($workers_avg_vsize / ($state_all || 1));
    $workers_avg_rss   = int($workers_avg_rss / ($state_all || 1));
    log_debug("apache root pid,vsize,rss; workers avg vsize,rss; workers running,all: $root_pid, $root_vsize, $root_rss; $workers_avg_vsize, $workers_avg_rss, $state_run, $state_all");

    # Error log path: the last word of the first ErrorLog line of the config,
    # or, failing that, an error.log file the root process keeps open.
    my $log = (split /\s+/, qx(grep ErrorLog $conf | head -1))[-1];
    if ((not defined($log) or $log !~ /\w+/) and $root_pid) {
        log_debug("no ErrorLog found in $conf, try lsof");
        $log = (split /\s+/, qx(lsof -p $root_pid | grep 'error\.log' | head -1))[-1];
    }
    $log = undef if defined($log) and $log !~ /\w+/;

    # Default to 0: with fatal warnings an undef here killed the script as
    # soon as the metric line below was built for an apache without a log.
    my $too_big_errlog_total = 0;
    if ($log) {
        chomp $log;
        log_debug("apache error log: $log");
        log_debug("grep root apache too big msg: grep -c 'too big' $log");

        $too_big_errlog_total = qx(grep -c 'too big' $log) || 0;
        chomp $too_big_errlog_total;
        log_debug("total too big msg: $too_big_errlog_total");
    }

    my $time = time;
    $apache{$conf_prefix}{graphite} = "one_min.$host.apache.$conf_prefix.too_big_errlog_total $too_big_errlog_total $time\n" .
        "one_min.$host.apache.$conf_prefix.rss $root_rss $time\n" .
        "one_min.$host.apache.$conf_prefix.vsize $root_vsize $time\n" .
        "one_min.$host.apache.$conf_prefix.workers_avg_rss $workers_avg_rss $time\n" .
        "one_min.$host.apache.$conf_prefix.workers_avg_vsize $workers_avg_vsize $time\n" .
        "one_min.$host.apache.$conf_prefix.workers_all $state_all $time\n" .
        "one_min.$host.apache.$conf_prefix.workers_running $state_run $time\n";

    # Limit for this particular apache (0 - unknown, nothing is checked).
    my $conf_rss_limit = $rss_limit_cli || get_direct_rss_limit($conf_prefix) || 0;
    $rss_limit_found ||= $conf_rss_limit;

    $apache{$conf_prefix}{restart} = 0;
    $apache{$conf_prefix}{warn} = 0;
    $apache{$conf_prefix}{crit} = 0;
    if ($conf_rss_limit) {
        my $restart_rss = $conf_rss_limit * $threshold_restart;
        my $warn_rss    = $conf_rss_limit * $threshold_warn;
        my $crit_rss    = $conf_rss_limit * $threshold_crit;

        if ($root_rss >= $restart_rss and $root_rss < $warn_rss) {
            $apache{$conf_prefix}{restart} = 1;
        }
        elsif ($root_rss >= $warn_rss and $root_rss < $crit_rss) {
            $apache{$conf_prefix}{warn} = 1;
            $apache{$conf_prefix}{restart} = 1;
            push @errmsg_warn, $conf_prefix;
        }
        elsif ($root_rss >= $crit_rss) {
            $apache{$conf_prefix}{crit} = 1;
            $apache{$conf_prefix}{warn} = 1;
            $apache{$conf_prefix}{restart} = 1;
            push @errmsg_crit, $conf_prefix;
        }
        log_debug(sprintf "restart rss %.1f, root rss %.1f, warn rss %.1f, crit rss: %.1f", $restart_rss, $root_rss, $warn_rss, $crit_rss);
    }
}

# The monrun branch only tests $rss_limit for truth ("no limit set/found"),
# so leave it true when a limit was known for at least one apache.
$rss_limit = $rss_limit_cli || $rss_limit_found;

# Act on the collected data according to the requested mode.
if ($action eq 'monrun') {
    # monrun format: "<status>;<message>", 2 - crit, 1 - warn, 0 - ok
    if (scalar @errmsg_crit) {
        print "2;Apache too big: " . join(' ', @errmsg_crit, @errmsg_warn) . "\n";
    }
    elsif (scalar @errmsg_warn) {
        print "1;Apache too big: " . join(' ', @errmsg_crit, @errmsg_warn) . "\n";
    }
    else {
        print $rss_limit ? "0;OK\n" : "0;OK - no limit set/found\n";
    }
}
elsif ($action eq 'graphite') {
    # Per-apache metrics first, then the totals over all apaches.
    print $_->{graphite} for values %apache;
    my $time = time;
    print "one_min.$host.apache_workers.all $total_state_all $time\n" .
          "one_min.$host.apache_workers.running $total_state_run $time\n";
}
elsif ($action eq 'restart') {
    my @dcs = qw(iva myt sas vla man);
    my %service = (
        # I don't want to hard-code here the datacenters each service lives
        # in, and there is no proper service discovery =(
        'soap_direct' => '/etc/init.d/soap.direct.yandex.ru',
        'intapi_direct' => '/etc/init.d/intapi.direct.yandex.ru',
        'ppc' => '/etc/init.d/ppc.yandex.ru',
    );

    # The presence of the stop file disables restarts altogether.
    my $stop_file = '/var/spool/apache-mem-monitor/apache-restart.stop';
    if (-f $stop_file) {
        log_info("Stop file $stop_file found, exiting");
        exit 0;
    }

    # Datacenter of this host. A missing or empty file used to abort the
    # script through a fatal warning; abort with a clear message instead.
    my $dc_file = '/etc/ppcinv/localhost.root_dc';
    open(my $fh, '<', $dc_file) or die "Can't open $dc_file: $!\n";
    my $my_dc = <$fh>;
    close($fh);
    die "$dc_file is empty\n" if !defined $my_dc;
    chomp $my_dc;

    chomp(my $cur_day = qx(date +%u));
    # date +%w does not fit here: Monday has to be 0, not Sunday
    $cur_day--;
    my $dc_today_id = $cur_day % scalar(@dcs);
    my $dc_today = $dcs[$dc_today_id];

    # Index of my dc in @dcs (scalar(@dcs) if it is not listed); logging only.
    my $my_dc_id = 0;
    for my $dc (@dcs) {
        last if $dc eq $my_dc;
        $my_dc_id++;
    }

    log_info("Datacenters: " . join(" ", @dcs));
    log_info("Day number $cur_day is the day for dc $dc_today (id $dc_today_id)");
    log_info("My dc - $my_dc (id $my_dc_id), find apache to restart ...");
    my $restarted = 0;
    # Sorted, so that the order of restarts is the same on every run.
    for my $conf (sort keys %apache) {
        my $data = $apache{$conf};
        log_info("Apache - $conf, limits exceeded - restart: $data->{restart}, warn: $data->{warn}, crit: $data->{crit}");

        if ($data->{warn}) {
            log_info("Restart $my_dc - apache exceeds warn limit");
        }
        elsif ($force_dc && ($force_dc eq $my_dc || $force_dc eq 'any')) {
            log_info("Restart $my_dc as requested (--force-dc), ignore day number");
        }
        elsif ($data->{restart} && $dc_today eq $my_dc) {
            log_info("Restart $my_dc - today is my day and apache exceeds restart limit");
        }
        else {
            log_info("No reason to restart - not my dc day, or warn limit is not exceeded, or not --force-dc");
            next;
        }

        # Only the apaches listed in %service can be restarted; say so
        # instead of silently doing nothing after the "Restart" line above.
        if (!$service{$conf}) {
            log_info("No init script known for $conf, nothing to restart");
            next;
        }

        # If a tcp-reset rule is already in the iptruler dump, the host is
        # closed by someone else: do not close it and, above all, do not
        # open it afterwards.
        chomp(my $already_closed = qx(iptruler dump | grep 'tcp-reset'));
        log_info("No slb actions - iptruler 'all down' rule already present") if $already_closed;

        # Errors are deliberately not handled: at most one datacenter a day
        # is restarted, and if something goes wrong it shows on the graphs.
        # The random sleep spreads out restarts started at the same moment;
        # all pauses are skipped in dry-run mode.
        sleep(int rand 300) if $force;
        my_system(qw(iptruler all down)) if !$already_closed;
        sleep(20) if $force;
        my_system(($service{$conf}, 'stop'));
        sleep(60) if $force;
        my_system(($service{$conf}, 'start'));
        sleep(20) if $force;
        my_system(qw(iptruler all up)) if !$already_closed;
        $restarted++ if $force;
    }
    log_info("$restarted apaches was restarted");
}


# Look up the per-worker memory limit for an apache in the Settings module
# of whichever project is installed on this host.
#
#   $conf - config prefix of the apache (ppc, soap_direct, ...); for the
#           ppc.yandex.ru tree a prefix matching /soap/ selects
#           SOAP_APACHE_MAX_PROC_SIZE, anything else APACHE_MAX_PROC_SIZE.
#
# Returns the Settings value divided by 1024 (presumably bytes -> KB, to be
# comparable with the rss column of ps - confirm against Settings), or 0 when
# no project directory is present or the value is missing / not a number.
# Does not touch any file-level variable.
sub get_direct_rss_limit {
    my $conf = shift;

    my $limit;
    if ($conf =~ /ppc|soap|intapi/ && -d '/var/www/ppc.yandex.ru/protected/') {
        my $cmd = q(cd /var/www/ppc.yandex.ru/protected/ 2>/dev/null && perl -e 'use my_inc ".."; use Settings; print join " ", $Settings::APACHE_MAX_PROC_SIZE, $Settings::SOAP_APACHE_MAX_PROC_SIZE' 2>/dev/null);
        my ($first_line) = split /\n/, qx($cmd);
        # The helper may print nothing at all (e.g. Settings failed to load).
        my ($rss_front, $rss_soap) = split /\s+/, $first_line // '';
        $limit = $conf =~ /soap/ ? $rss_soap : $rss_front;
    }
    elsif (-d '/var/www/geocontext.yandex.ru/lib/') {
        my $cmd = q(perl -I /var/www/geocontext.yandex.ru/lib/ -MSettings -e 'print $Settings::APACHE_MEMLIMIT' 2>/dev/null);
        ($limit) = split /\n/, qx($cmd);
    }
    # The path used to carry a stray "-I" prefix, so this branch never fired.
    elsif (-d '/var/www/direct-mod.yandex.ru/protected') {
        my $cmd = q(perl -I/var/www/direct-mod.yandex.ru/protected -MSettings -e 'print $Settings::APACHE_MAX_PROC_SIZE');
        ($limit) = split /\n/, qx($cmd);
    }

    # Accept a plain non-negative number only; everything else (undef, empty,
    # an error text) means "no limit" instead of a fatal warning further on.
    ($limit) = ($limit // '') =~ /\A\s*(\d+(?:\.\d+)?)\s*\z/;
    $limit //= 0;
    $limit /= 1024 if $limit;

    log_debug("rss limit for $conf: $limit");
    return $limit;
}

# Print a timestamped DEBUG line made of the space-joined arguments to
# STDOUT. Does nothing unless --debug was given or the DEBUG environment
# variable is true.
sub log_debug {
    my @parts = @_;
    return unless $ENV{DEBUG} or $debug;

    my $stamp = qx(date -Isec);
    chomp $stamp;
    print "[$stamp] DEBUG: @parts\n";
}

# Print a timestamped INFO line made of the space-joined arguments to STDOUT.
sub log_info {
    my @parts = @_;

    my $stamp = qx(date -Isec);
    chomp $stamp;
    print "[$stamp] INFO: @parts\n";
}

# Run an external command given as a list (no shell involved), but only
# when --force is set; otherwise just log what would have been run.
#
# Returns true when the command exited with status 0, and also in dry-run
# mode, where nothing is executed. A command killed by a signal or one that
# could not be started counts as a failure.
sub my_system {
    my @cmd = @_;

    log_info(($force ? "Run " : "Dry-run (use --force) ") . join(" ", @cmd));
    # Nothing was run, so there is no status to look at: $? still holds the
    # status of some earlier child (log_info itself runs date).
    return 1 if !$force;

    # Keep the return value of system(): it is the full wait status, and
    # the log_info call below overwrites $?.
    my $status = system { $cmd[0] } @cmd;
    log_info("Command failed (wait status $status): " . join(" ", @cmd)) if $status != 0;
    return $status == 0;
}
