#!/usr/bin/perl

use strict;use warnings;
use Digest::SHA qw(sha384_hex);
use Getopt::Std;
use JSON::PP;
use LWP::Simple;

=head1 rotate-tls-session-keys.pl
    Rotate TLS session keys once per day

    Rotate these three keys:
    tls_key_previous, tls_key_current, tls_key_next

    First under this tree:
    video/usher-proxy/production/cert_usher_ttvnw_net

    And next, under this tree:
    video/video-edge/production/$dc/wildcard_hls_ttvnw_net

    where $dc is every datacenter that is seen via:
    curl http://consul.internal.justin.tv/v1/catalog/datacenters

    These keys need to be either 48 binary bytes in the case of
    usher-proxy, or 64 bytes encoding 48 binary bytes, in the case
    of video-edge.

    Rotation is as follows:
    tls_key_previous = tls_key_current
    tls_key_current = tls_key_next
    tls_key_next = <newly generated value of appropriate format>

    All logging is to STDOUT and STDERR, which should land in
    /var/log/jtv/rotate-tls-session-keys-cron.log if this is invoked in the
    expected automatic way.

    This program also creates the three necessary keys when a new
    datacenter is detected.  This detection is via looking for a
    key get failure.

=cut

#basic options
my $opts = {};
getopts('dt', $opts);
#the next two lines are just for clarity; getopts() sets the 'd' or 't', but
#I want to reference those flags with a better, more readable name.
$opts->{debug} = $opts->{d} || 0;  #don't have any debugging stuff set yet
$opts->{test} = $opts->{t} || 0; #causes PUTs to not happen

#The list of actual keys that are rotated through
my @keys = ('tls_key_previous','tls_key_current','tls_key_next');

#convenience function
sub say {
    my $saystr = shift;
    my $now = scalar localtime;
    print "$now: $saystr\n";
}

say 'beginning';
say 'debug set' if $opts->{debug};
say 'test set' if $opts->{test};

#setup some kind of reasonable PATH
$ENV{PATH} = "/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/usr/local/sbin:$ENV{PATH}";

#See if we can find a sandstorm executable
#Kind of a ghetto way to do it, but it's robust and simple.
{   my $ret = `which sandstorm`;    #the {} syntax forces $ret to be only in
                                    #scope for these two lines of code
    die 'unable to locate sandstorm executable' if length($ret) < 9;
}

#Convert the sensitive value into something short, consistent, and not
#sensitive, and also output how long the original value is
sub _key_fingerprint {
    my $key = shift;
    $key = '' unless defined $key;
    my $length = length($key);
    my $digest = substr(sha384_hex($key), 0, 5);
    return "$digest($length bytes)";
}

#takes the key path and the value to put.
#returns nothing useful, and throws an exception if anything strange
#happens, while logging what it's doing.
sub sandstorm_put {
    my $full_key_path = shift;
    my $value = shift;

    #do not do the put if -t is specified
    if($opts->{test}) {
        say "TEST: sandstorm put $full_key_path " . _key_fingerprint($value);
        return;
    }
    say "sandstorm put $full_key_path " . _key_fingerprint($value);
    eval {
        local $SIG{ALRM} = sub { die "timed out\n"; };  #as elsewhere, a 45
                                                        #second timeout
        alarm 45;
        open my $fh, '|-', "sandstorm put $full_key_path 2>&1"
            or die "open failed: $!";   #$! contains a string with the error
                                        #these functions return 0 on failure.
        print $fh $value
            or die "print failed: $!";
        close $fh or die "close failed: $!";
    };
    alarm 0;
    #eval {} sets $@ if there was a die() inside of it.  So these lines
    #of code are effectively re-throwing an exception, with additional
    #information.
    die "sandstorm_put failed: (\$full_key_path=$full_key_path, \$value=" . _key_fingerprint($value) . "): $@"
        if $@;
}

#nginx wants exactly 48 binary bytes, so get that from /dev/urandom
sub generate_nginx_ticket_key {
    my $ret = eval {
        open my $fh, '<', '/dev/urandom'
            or die "open of /dev/urandom failed: $!";
        read $fh, my $new_key, 48
            or die "read from /dev/urandom failed: $!";
        close $fh or die "close of /dev/urandom failed: $!";
        return $new_key;
    };
    die "generate_nginx_ticket_key failed: $@" if $@;
    return $ret;
}

#haproxy wants exactly 48 binary bytes, base64 encoded, so use
#openssl to create that
sub generate_haproxy_ticket_key {
    my $ret = eval {
        open my $fh, '-|', 'openssl rand 48 -base64'
            or die "open of 'openssl rand 48 -base64' failed: $!";
        read $fh, my $new_key, 64
            or die "read from 'openssl rand 48 -base64' failed: $!";
        close $fh or die "close of 'openssl rand 48 -base64' failed: $!";
        return $new_key;
    };
    die "generate_haproxy_ticket_key failed: $@" if $@;
    return $ret;

}

#Get some sandstorm key, but only the number of bytes that we need.
#This should protect us from anything that the sandstorm system adds to
#the end.
#In golang style, this returns two items: the value (if any) and an
#error (if any)
#But this ALSO throws more traditional exceptions, for more exceptional,
#unforseen situations.  So this should be called in an eval {} block.
sub sandstorm_get {
    my $full_key_path = shift;
    my $required_key_length = shift;
    my $return_err = undef;
    my $ret = eval {
#        if($full_key_path =~ /sel01/) {
#            die 'death because testing';
#        }
        open my $fh, '-|', "sandstorm get $full_key_path 2>&1"
            or die "open failed: $!";
        my $bytes_read = read $fh, (my $value), $required_key_length
            or die "read failed: $!";
        close $fh;
        #close your eyes for this stuff
        #since we don't have any supporting modules, we have to use the
        #primitive, built-in variables to get the return code from the
        #call to 'sandstorm get', above, in order to find out if the
        #get failed.
        my $raw_exit_code = $?; #what actually comes back is 8 bits shifted
        if($raw_exit_code) {
            my $err = $!;   #$! contains the string version of UNIX errno
            my $exit_code = $raw_exit_code / 256;   #..unshifted here to get
                                                    #the real exit code

            #the sandstorm executable returns exit code 1, and STDERR
            #of 'no such secret' if the requested key isn't found.
            #We catch that particular case.
            if($exit_code == 1 and $value =~ /no such secret/) {
                #No secret means we are all done, setting the return_err
                #to a predictable string, and return undef, which will be
                #the first part of the return tuple.
                $return_err = 'no such secret';
                return undef;   #returns out of the eval {} block
            } else {
                die "read failed $err";
            }
        }
        if($bytes_read != $required_key_length) {
            die "$bytes_read bytes read out of a required $required_key_length";
        }
        #this returns from the eval {}
        return $value;
    };
    #first we re-throw if a traditional exception happened, enriching
    #the exception with some useful text.  $@ is where the exception string
    #is put, if any.
    if($@) {
        die "sandstorm_get failed: (\$full_key_path=$full_key_path \$required_key_length=$required_key_length): $@";
    }
    #otherwise, we return the tuple.  One of these should be set, and the
    #other undef.
    return ($ret, $return_err);
}

#get a list of datacenters to operate on
sub get_dcs {
    my @valid_datacenters;
    my @invalid_dc = [ "disasterrecovery", "cmh01", "lhr05", "pdx05" ];
    my $ret = get("http://consul.internal.justin.tv/v1/catalog/datacenters");
    my $dcs = decode_json($ret);

    foreach my $check_dc (@$dcs) {
        my $data_center_services = decode_json(get("http://consul.internal.justin.tv/v1/catalog/services?dc=${check_dc}"));
        if ( exists $data_center_services->{'video-hls-replication'} ) {
            unless ( $check_dc ~~ @invalid_dc ) {
                push @valid_datacenters , $check_dc;
            }
        }
    }

    return @valid_datacenters;
}

#This does a single rotation tuple.
#It takes in the full key prefix, the number of bytes the value must
#contain, and a reference to an appropriate key generation function.
#It also detects the case where a new datacenter hasn't had any keys
#set and rotation is impossible, so it does the initial set of the
#necessary three keys.
#Its first argument is the path to the keys to be rotate.
#Its second argument is the expected length of those keys.
#Its third and final argument is a CODE reference that will, when
#called, return a new key of the proper length.
#It doesn't return anything useful.
sub do_key_triple {
    my $key_path_prefix = shift;
    my $required_key_length = shift;
    my $key_generator = shift;

    #grab the current keys
    my $current_keys = {};
    foreach my $key (sort @keys) {  #The sort isn't technically necessary, but
                                    #it makes log analysis a little easier.
        my $full_path = "$key_path_prefix/$key";
        #see in-line doc with sub sandstorm_get()
        #in short, sandstorm_get() returns a golang style tuple, the first
        #item being the returned value, if any, and the second being the
        #error, if any.
        ($current_keys->{$key}, my $err) = sandstorm_get($full_path, $required_key_length);

        #So this means there was an error (not an exception) in sandstorm_get,
        #and specifically that the requested key didn't exist.  We now assume
        #this is a new datacenter, so we set call put_initial_keys() in
        #lieu of any of the normal rotation logic, and then return
        if($err and $err eq 'no such secret') {
            put_initial_keys($key_path_prefix, $required_key_length, $key_generator);
            return;
        }
        say "current key: $full_path=" . _key_fingerprint($current_keys->{$key});
    }

    #now rotate them
    $current_keys->{tls_key_previous} = $current_keys->{tls_key_current};
    $current_keys->{tls_key_current} = $current_keys->{tls_key_next};
    $current_keys->{tls_key_next} = &$key_generator;

    #put the updated values back
    foreach my $key (sort @keys) {
        my $full_path = "$key_path_prefix/$key";
        say "PUT key: $full_path=" . _key_fingerprint($current_keys->{$key});
        sandstorm_put($full_path, $current_keys->{$key});
    }
}

#takes the same stuff as do_key_triple, but instead of rotating the keys,
#it creates the necessary three and sets them, assuming this is a new
#DC.
sub put_initial_keys {
    my $key_path_prefix = shift;
    my $required_key_length = shift;
    my $key_generator = shift;

    say "Values under $key_path_prefix weren't found, so this is probably a new datacenter.";
    say "Setting all keys to initial values.";
    foreach my $key (sort @keys) {
        my $full_path = "$key_path_prefix/$key";
        my $key_value = &$key_generator;
        say "INITIAL PUT key: $full_path=" . _key_fingerprint($key_value);
        sandstorm_put($full_path, $key_value);
    }
}

#main starts here.  I could do a sub main {}  main();
#but what's the point for this kind of program?


#Keep track of the nagios return string and return code so we can write
#them to a file at the end.
my $nagios_string = 'OK, last run at ' . (scalar localtime);
my $nagios_code = 0;

############################################
#First do usher-proxy.  This is global, not per-datacenter
eval {
    say 'beginning usher-proxy nginx key rotation';
    #the following two lines and the alarm 0; later is about putting a
    #timeout of (in this case) 45 seconds on all of the stuff in the eval {}
    #block, which is, in this case, just do_key_triple()
    local $SIG{ALRM} = sub { die "timed out\n"; };
    alarm 45;
    do_key_triple(
        'video/usher-proxy/production/cert_usher_ttvnw_net',
        48,
        \&generate_nginx_ticket_key
    );
};
alarm 0;
#exception string goes into $@
if($@) {
    say "usher-proxy nginx key rotation failed: $@";
    $nagios_string = 'usher-proxy key (video/usher-proxy/production/cert_usher_ttvnw_net/*) rotation failed';
    $nagios_code = 2;
}

#create haproxy ticket keys for usher-external
#NOTE: I would like to do BOTH temporarily as we slowly migrate from one infra to another.
eval {
    say 'beginning usher-external haproxy key rotation';
    #the following two lines and the alarm 0; later is about putting a
    #timeout of (in this case) 45 seconds on all of the stuff in the eval {}
    #block, which is, in this case, just do_key_triple()
    local $SIG{ALRM} = sub { die "timed out\n"; };
    alarm 45;
    do_key_triple(
        'video/usher-external/production/cert_usher_ttvnw_net',
        64,
        \&generate_haproxy_ticket_key
    );
};
alarm 0;
#exception string goes into $@
if($@) {
    say "usher-proxy haproxy key rotation failed: $@";
    $nagios_string = 'usher-proxy key (video/usher-external/production/cert_usher_ttvnw_net/*) rotation failed';
    $nagios_code = 2;
}


############################################
#Now do video-edge.  This is per datacenter
my $failed_dcs = '';    #for $nagios_string
foreach my $dc (get_dcs()) {
    say "beginning video-edge key rotation \$dc=$dc";

    for (my $attempt=0; $attempt < 3; $attempt++) {
        eval {
            local $SIG{ALRM} = sub { die "timed out\n"; };
            alarm 10;
            do_key_triple(
                "video/video-edge/production/$dc/wildcard_hls_ttvnw_net",
                64,
                \&generate_haproxy_ticket_key
            );
        };
        alarm 0;
        if($@) {
            say "video-edge key rotation failed (\$dc=$dc): $@";
            if ($attempt >= 2) {
                $failed_dcs = "$failed_dcs $dc";
            }
        }
        else {
            last;
        }
    }
}

if($failed_dcs) {
    $nagios_string = "video-edge key (video/video-edge/production/\$dc/wildcard_hls_ttvnw_net) rotation failed for the following DCs: $failed_dcs";
    $nagios_code = 2;
}

say "\$nagios_string=$nagios_string  \$nagios_code=$nagios_code";
if($opts->{test}) {
    say 'TEST ending, without writing nagios info';
    exit 0;
}

#Now write the stuff the nagios NRPE check will read and return
eval {
    local $SIG{ALRM} = sub { die "timed out\n"; };
    alarm 10;
    my $file_prefix = '/var/log/rotate-usher-proxy-tls-session-keys-nagios-';
    {   my $filename = $file_prefix . 'string';
        open my $fh, '>', $filename
            or die "failed to open $filename for writing: $!";
        print $fh "$nagios_string\n"
            or die "failed to print to $filename: $!";
        close $fh or die "failed to close $filename: $!";
    }
    {   my $filename = $file_prefix . 'return-code';
        open my $fh, '>', $filename
            or die "failed to open $filename for writing: $!";
        print $fh "$nagios_code\n"
            or die "failed to print to $filename: $!";
        close $fh or die "failed to close $filename: $!";
    }
};
alarm 0;
if($@) {
    #I don't think there's anything else we can do in this case except log
    #what happened and move on.
    say "Exception in writing nagios info: $@";
}

say 'ending';
exit 0;
