#!/usr/bin/perl
use warnings;
use strict;

use Sys::Syslog qw/ :standard :macros /;
use Getopt::Long;
use IO::Socket::INET;
use Time::HiRes qw/ sleep time /;

$| = 1;

my $host = "127.0.0.1:11211";
my $check_interval = 2;
my $fast_evict_time = 60;
my $disable_actions = 0;
my $min_donator_size = 8;
my $logfile = '';

my $help = 0;

GetOptions("host=s" => \$host,
           "log=s" => \$logfile,
           "interval=i" => \$check_interval,
           "fast_evict_time=i" => \$fast_evict_time,
           "disable_actions" => \$disable_actions,
           "min_donator_size=i" => \$min_donator_size,
           "help" => \$help)
       or usage();

usage() if @ARGV;
usage() if $help;

openlog("usher-slab-reallocator", "cons,pid", "local3");

my $sock = IO::Socket::INET->new($host);
if ( not $sock ) {
    syslog(LOG_ERR, "Couldn't connect to %s: %m", $host);
    die "Couldn't connect to $host: $!";
}

sub usage {
    print STDERR "Usage: $0 [--host hostname] [--interval check_interval] [--fast_evict_time\n";
    print STDERR "       interval] [--log logfile] [--disable_actions] [--min_donator_size\n";
    print STDERR "       pagecount]\n";
    print STDERR "\n";
    print STDERR "  --interval check_interval\n";
    print STDERR "    The number of seconds between checks. Default: 2.\n";
    print STDERR "  --fast_evict_time interval\n";
    print STDERR "    The number of seconds considered to be a short eviction, forcing a\n";
    print STDERR "    reallocation.\n";
    print STDERR "  --disable_actions\n";
    print STDERR "    Don't actually reallocate anything when issues are found.\n";
    print STDERR "  --min_donator_size pagecount\n";
    print STDERR "    Only choose slabs with more than pagecount pages as donators to other slabs\n";
    exit 1;
}

my $last_stats;
my $last_time;

sub get_stats {
    my %stats;

    my $lines = 0;

    for my $type ( qw/ slabs items / ) {
        print $sock "stats $type\n" or die;

        while ( my $line = <$sock> ) {
            $lines++;
            $line =~ s/[\r\n]+$//;
            last if $line eq "END";

            if ( $line =~ /^STAT (?:items:)?(\d+):(\S+) (\d+)$/ ) {
                $stats{$1}{$2} = $3;
            } elsif ( $line =~ /^STAT (\S+) (\d+)$/ ) {
                # global stat
            } else {
                syslog(LOG_WARNING, "Unknown line from memcached: %s", $line);
            }
        }
    }

    die "no response from server" unless $lines > 0;

    # stats items only returns stuff if the slab is non-empty
    # unfortunately, this means we get no info on empty slabs with
    # existing errors or evictions.
    #
    # we delete their info to avoid mistaking their change to non-emptyness
    # for new errors/evictions.
    SLAB: for my $k ( keys %stats ) {
        for my $name ( qw/ evicted evicted_time outofmemory total_pages / ) {
            if ( not exists $stats{$k}{$name} ) {
                delete $stats{$k};
                next SLAB;
            }
        }

        if ( $stats{$k}{total_pages} == 0 ) {
            delete $stats{$k};
            next SLAB;
        }
    }

    # {
    #     "32" => { "evicted" => ..., "total_pages" => ..., ... }
    #     ...
    # }
    return \%stats;
}

sub get_differences_and_update {
    my $this_time = time;
    my $this_stats = get_stats;

    my %diffs;

    if ( defined $last_time and defined $last_stats ) {
        SLAB: for my $slabid ( keys %$this_stats ) {
            next if not exists $last_stats->{$slabid};
            for my $stat ( keys %{$this_stats->{$slabid}} ) {
                if ( not exists $last_stats->{$slabid}{$stat} ) {
                    delete $diffs{$slabid};
                    next SLAB;
                }
                $diffs{$slabid}{$stat} = $this_stats->{$slabid}{$stat} - $last_stats->{$slabid}{$stat};
            }
        }
    }

    $last_time  = $this_time;
    $last_stats = $this_stats;

    return %diffs ? ($this_stats, \%diffs, $this_time - $last_time) : undef;
}

sub pick_random_slab {
    my ($stats, $total_pages) = @_;

    # picks a random slab weighted by the number of pages in each slab
    # so larger slabs are more likely to be picked

    my $page_index = int rand $total_pages;

    for my $idx ( sort keys %$stats ) {
        if ( $page_index <= $stats->{$idx}{'total_pages'} ) {
            return $idx;
        } else {
            $page_index -= $stats->{$idx}{'total_pages'};
        }
    }

    die "not reached";
}

sub move_slab {
    my ($from, $to) = @_;

    syslog(LOG_NOTICE, "Reassigning a slab from %d to %d", $from, $to);

    if ( $disable_actions ) {
        syslog(LOG_NOTICE, "Skipping reassignment, actions explicitly disabled");
        return;
    }

    print $sock "slabs reassign $from $to\n";
    my $result = <$sock>;
    $result =~ s/[\r\n]+$//;

    syslog(LOG_NOTICE, "Result: %s", $result);
}

sub process_differences {
    my ($stats, $stat_diff, $time_diff) = @_;

    my %wanted = ();
    my $total_pages = 0;

    # figure out which slabs want pages, put those in %wanted
    for my $slabid ( keys %$stat_diff ) {
        next if not exists $stats->{$slabid};

        my $slab_diff = $stat_diff->{$slabid};
        my $slab = $stats->{$slabid};

        $total_pages += $slab->{'total_pages'};

        # at least one fast eviction
        if ( $slab_diff->{'evicted'} > 0 and $slab->{'evicted_time'} < $fast_evict_time ) {
            syslog(LOG_NOTICE, "slab %d has a fast eviction (evicted_time = %d)", $slabid, $slab->{'evicted_time'});
            $wanted{$slabid}++;
        }

        # or any OOM errors
        if ( $slab_diff->{'outofmemory'} > 0 ) {
            syslog(LOG_NOTICE, "slab %d has %d new OOM errors", $slabid, $slab_diff->{'outofmemory'});
            $wanted{$slabid}++;
        }
    }

    # now move pages
    for my $slabid ( keys %wanted ) {
        my $donator;
        
        # figure out who's going to donate a page, randomly
        my $tries = 0;
        {
            $tries++;
            $donator = pick_random_slab($stats, $total_pages);

            # don't keep trying forever
            last if $tries > 100;

            # don't donate from pages we want to donate to (including ourselves)
            redo if exists $wanted{$donator};

            # don't donate from slabs with very few pages
            redo if $stats->{$donator}{total_pages} < $min_donator_size;
        }

        move_slab($donator => $slabid);
    }
}

{
    my $last_talk_time = time;
    sub talk_periodically {
        return if time() - $last_talk_time < 60*60*24;
        syslog(LOG_DEBUG, "Still talking to %s", $host);
        $last_talk_time = time;
    }
}

$SIG{ALRM} = sub {
    die "Taking too much time in iteration";
};

my $done = 0;

$SIG{INT} = $SIG{HUP} = $SIG{TERM} = sub {
    syslog(LOG_NOTICE, "Exiting on signal");
    die "exiting on signal\n";
    $done = 1;
};

eval {
    syslog(LOG_NOTICE, "Now handling %s", $host);
    while ( not $done ) {
        alarm $check_interval*2;
        my $start_time = time;

        my ($stats, $stat_diff, $time_diff) = get_differences_and_update();
        if ( defined $stat_diff ) {
            process_differences($stats, $stat_diff, $time_diff);
        }

        my $time_taken = time() - $start_time;

        alarm 0;

        my $time_to_wait = $check_interval - $time_taken;
        sleep $time_to_wait if $time_to_wait > 0;

        talk_periodically();
    }
};

if ( $@ and $@ ne "exiting on signal\n" ) {
    syslog(LOG_ERR, "Died: %s", $@);
    exit 1;
}

exit 0;

