#!/bin/bash
# 2ndQuadrant avoid_anti_wraparound_vacuums script,
# Copyright 2013-2017 2ndQuadrant Ltd, All rights reserved
# Licensed only for use by 2ndQuadrant customers
# Author Martín Marqués <martin@2ndquadrant.com>
#
# Supported PostgreSQL versions 9.1, 9.2, 9.3, 9.4, 9.5 and 9.6
# Should work on earlier versions, but there's no need to go back
# too much.

# Release identifier; it is announced on every run so that captured output
# records which version of the script produced it.
VERSION=4.1
printf '%s\n' "avoid_anti_wraparound_vacuums version ${VERSION}"

# Only "nounset" is enabled. Functions in this script report conditions to
# their callers through non-zero return values, which "errexit" (-e) would
# turn into an immediate abort.
set -o nounset

# Drop libpq environment overrides before any psql call is made.
unset PGDATABASE \
      PGHOST \
      PGPORT \
      PGUSER \
      PGOPTIONS

# Print the help text (purpose, defaults, command-line options and
# authentication hints) to stdout and then terminate the whole script with
# exit status 0.
#
# The text is a single double-quoted string, so $HOME is expanded at the time
# the help is printed and \" yields a literal double quote.
function usage() {
    echo "
The avoid_anti_wraparound_vacuums.sh script searches for relations and
materialized views which are close to requiring an anti-wraparound vacuum
freeze. The script then performs pro-active freezing of tuples by either
one of two methods:

1) Performs a VACUUM with vacuum_freeze_table_age set to 0.
2) Calls pg_antifreeze() belonging to the antifreeze extension.

The method used will depends on if the antifreeze extension is installed in
the database being vacuumed or not.

Running the script at times when the server is not very busy will prevent
autovacuum from running an anti-wraparound vacuum (which can't be cancel
like normal autovacuums can) at busy hours. How close tables are allowed to
get to an anti-wraparound vacuum is controlled by the freeze_bound. (See below)

Default connection values are:

   - Host:         /tmp (where the postgres socket normally lives)
   - Port:         5432
   - User:         postgres
   - Freeze Bound: 80%

There are optional parameters that you can use to change the behaviour
(keep in mind that it's best to run it as a postgres superuser), which
we detail next.

  -H     Specify the host to which to connect. Defaults to local socket
         connections.

  -p     Specify a port different from the default 5432.

  -U     Specify a different user other then the postgres database user.

  -t     This option lets the user specify the maximum amount of time in
         seconds that the script will run. This is important because we may
         not want to have the script running outside the maintenance window
         that we have set.
         This defaults to 3600 (1 hour).

  -S     This option sets an upper bound for relation size so we run the
         vacuum only on relations which are smaller than this amount of
         bytes.
         Size is in bytes and if not set, it will not filter on upper
         bound size (other filters may appear)

  -s     This option sets a lower bound for relation size so we run the
         vacuum only on relations which are larger than this amount of
         bytes.
         Size is in bytes and if not set, it will not filter on lower
         bound size (other filters may appear)

  -b     This is used to set the percentage which we will use to select
         relations to be vacuumed.
         The value should be a number between 0 and 99. Only relations that
         have the relfrozenxid from heap or it's toast table higher than
         this percentage will be vacuumed. When using the antifreeze
         extension, the heap and TOAST tables will be considered for freezing
         separately.
         The special value 0 will attempt to freeze all tables.
         If not specified, the default of 80% will be used.

  -c     Use this to specify if there will be special vacuum parameters to
         be used in a configuration file. The file name must be specified
         after.
         Example: -c aawv.conf

  -h     Show help (which is what you are reading now ;) ).


To invoke it you need to:
\"./avoid_anti_wraparound_vacuums.sh -H host -p port -U user -t timeout\"

Authentication
--------------

You will probably want to create a $HOME/.pgpass file containing a line like:

host:port:*:user:password

eg:

localhost:5432:*:postgres:thepassword

to avoid being prompted for a password every time this script invokes psql.
Set the file permissions to 0600 so that psql will read it:

    chmod 0600 $HOME/.pgpass

If you get the error:

    fe_sendauth: no password supplied

during connection test then psql isn't finding your .pgpass file, it isn't
happy with its permissions, or can't find a password for the host and
database you're using in the file.

"
    # Showing the help ends the script; callers never regain control.
    exit 0
}

# Refresh the global seconds_elapsed with the number of whole seconds that
# have passed since start_time (an epoch timestamp).
# Globals: start_time (read), seconds_elapsed (written)
function update_seconds_elapsed()
{
    local now
    now=$(date '+%s')
    seconds_elapsed=$(( now - start_time ))
}

# Refresh the global seconds_remaining with how much of the allotted run time
# is left. The result is negative once the allotment has been exceeded.
# Globals: start_time, totaltime (read), seconds_remaining (written)
# The global seconds_elapsed is deliberately left untouched.
function update_seconds_remaining()
{
    local now
    now=$(date '+%s')
    seconds_remaining=$(( totaltime - (now - start_time) ))
}

# Write one log line to stdout, prefixed with the current time in ISO-8601
# format (seconds precision) so entries can be traced when debugging.
# Arguments: $1 - the message text
function logmsg()
{
    local stamp
    stamp=$(date --iso-8601=seconds)
    printf '[%s]: %s\n' "$stamp" "$1"
}

# Run psql against the configured server with the script's standard switches:
# no psqlrc, never prompt for a password, ON_ERROR_STOP=0, quiet, unaligned,
# tuples only, single-space field separator.
#
# Usage: psql_wrap [-d DATABASE] [extra psql arguments...]
# The database defaults to "postgres"; a leading "-d NAME" pair overrides it.
# Globals: host, port, user (read), DBNAME (written)
# Returns: whatever psql returns
function psql_wrap()
{
    DBNAME="postgres"

    case "${1:-}" in
        -d)
            DBNAME=$2
            # Drop the "-d NAME" pair so it is not passed on a second time.
            shift
            shift
            ;;
    esac

    local -a fixed_args=(
        --no-psqlrc -w
        -v ON_ERROR_STOP=0
        -h "$host" -p "$port" -U "$user" -d "$DBNAME"
        -q -A -F " " -t
    )

    psql "${fixed_args[@]}" "$@"
}

# Check that every line of the vacuum configuration file matches an allowed
# pattern, so that only whitelisted statements are later fed to psql.
#
# Globals:   vacuum_configuration_file, debugging (read)
# Arguments: $1 - extended regular expression every line must match
# Returns:   0 - a configuration file is set and no line violates the pattern
#                (an unreadable or missing file also ends up here: grep
#                reports the problem on stderr and finds no offending line)
#            1 - wrong number of arguments, or at least one offending line
#            2 - no configuration file was given
function verify_conf_file_sanity()
{
    if [ "$#" -ne 1 ]; then
        return 1
    fi
    local allowed_pattern=$1

    # Without a configuration file there is nothing to check and nothing to
    # load; that is reported with its own non-zero code.
    if [ -z "${vacuum_configuration_file}" ]; then
        return 2
    fi

    if [ "${debugging}" -gt 0 ]; then
        logmsg "DEBUG: Running grep check on file ${vacuum_configuration_file} for string ${allowed_pattern}"
    fi

    # -v selects the lines that do NOT match the whitelist; -q makes grep
    # succeed as soon as one such line exists. Pattern and path are quoted so
    # that a path containing whitespace is checked as one file instead of
    # making grep fail and the check pass vacuously.
    if grep -E -v -q -e "${allowed_pattern}" -- "${vacuum_configuration_file}"; then
        return 1
    fi

    return 0
}

# Validate the value given with -b and print the freeze bound to use.
#
# Arguments: $1 - requested percentage
# Outputs:   the value itself when it is a plain non-negative integer in the
#            range 0 to 99 (leading zeros tolerated); otherwise the default 80
#
# Anything that is not purely digits (empty, negative, text, SQL fragments)
# falls back to the default. The result is interpolated into SQL by the
# callers, so it must never be echoed back unchecked.
function parse_freeze_bound()
{
    local requested=${1:-}
    local -r valid_bound_re='^0*[0-9]{1,2}$'

    if [[ "$requested" =~ $valid_bound_re ]]; then
        printf '%s\n' "$requested"
    else
        # We don't accept values outside the range of 0 to 99.
        printf '%s\n' 80
    fi
}

# Freeze one relation, either through the antifreeze extension or through a
# plain VACUUM with the freeze table ages forced to zero.
#
# Globals:   database, use_antifreeze_ext, vacuum_configuration_file (read)
#            seconds_remaining (refreshed through update_seconds_remaining)
# Arguments: $1 - relation name as it should appear in the SQL statement
# Outputs:   a log line plus whatever psql prints
# Returns:   0 - the caller may continue with further relations (the psql
#                exit status is deliberately ignored, e.g. a statement
#                timeout on one relation must not stop the others)
#            1 - the time budget is exhausted, before or after the freeze
function freeze_tuples()
{
    local relation=$1

    update_seconds_remaining

    if [ "$seconds_remaining" -le 0 ]; then
        # No more time to execute here, we're returning a non zero value
        # so appropriate measures are taken outside the function.
        return 1
    fi

    # statement_timeout is expressed in milliseconds; cap the statement at
    # the time that is left for the whole run.
    local new_timeout=$((seconds_remaining * 1000))

    # Only include the configuration file when it passed the whitelist check
    # (SET statements, SQL comments and empty lines).
    local load_conf_file=""
    if verify_conf_file_sanity '(^SET|^--|^$)'; then
        load_conf_file='\i '"${vacuum_configuration_file}"
    fi

    # Use the antifreeze extension, if enabled, else we fall
    # back to the good old vacuum with vacuum_freeze_table_age set to zero.
    # The operand is quoted so that an empty probe result selects the VACUUM
    # path cleanly instead of raising a test syntax error.
    if [ "$use_antifreeze_ext" = 't' ]; then
        logmsg "Performing freeze on relation ${relation} in $database database using antifreeze"
        echo ""

        psql_wrap -d "$database" <<EOF
-- Load the antifreeze extension
LOAD 'antifreeze';

-- We need to set antifreeze.freeze_xid_trigger_age to vacuum_freeze_min_age
-- by updating the pg_settings table
UPDATE pg_settings SET setting = current_setting('vacuum_freeze_min_age')
       WHERE name = 'antifreeze.freeze_xid_trigger_age';

-- Load parameters from conf file
${load_conf_file}
SET statement_timeout TO '${new_timeout}';
SELECT pg_antifreeze('${relation}');

-- Lets sleep for half a second
SELECT pg_sleep(0.5);
EOF
    else
        logmsg "Performing freeze on relation ${relation} in $database database using VACUUM"
        echo ""

        # Execute the VACUUM in verbose mode
        psql_wrap -d "$database" <<EOF
-- Load parameters from conf file
${load_conf_file}
SET statement_timeout TO '${new_timeout}';
SET vacuum_freeze_table_age TO 0;
SET vacuum_multixact_freeze_table_age TO 0;
VACUUM VERBOSE ${relation};
-- Lets sleep for half a second
SELECT pg_sleep(0.5);
EOF
    fi
    echo ""

    # Make a final check to see if we're now over the timeout.
    update_seconds_remaining

    if [ "$seconds_remaining" -le 0 ]; then
        return 1
    fi

    # All seems well enough to continue. We may have gotten a lock timeout,
    # but that does not stop us from going on with other relations.
    return 0
}

# Count the relations whose relfrozenxid age exceeds freeze_bound percent of
# autovacuum_freeze_max_age, honouring the relation size filter.
#
# Globals: database, use_antifreeze_ext, freeze_bound,
#          RELATION_SIZE_FILTER (read)
# Outputs: the whitespace-separated tokens psql printed, one per line (for
#          this count(*) query that is a single number); nothing if psql
#          printed nothing
function count_antiwraparound_relations()
{
    local rows token

    # When using the antifreeze extension, heap and TOAST relations are
    # counted separately, because pg_antifreeze(rel) does not freeze the
    # TOAST table whereas "VACUUM rel;" does. The operand is quoted so an
    # empty probe result selects the VACUUM variant without a test error.
    if [ "$use_antifreeze_ext" = 't' ]; then
        rows=$(psql_wrap -d "$database" <<EOF
SELECT count(*) AS count
FROM (SELECT c.oid::regclass as table_name,
(current_setting('autovacuum_freeze_max_age')::INT8 - age(c.relfrozenxid)) as xid_left_to_antifreeze
FROM (pg_class c JOIN pg_namespace n ON (c.relnamespace=n.oid))
WHERE c.relkind IN ('r','t','m') and age(c.relfrozenxid)::INT8  > (current_setting('autovacuum_freeze_max_age')::INT8 * (${freeze_bound} / 100.0) )
${RELATION_SIZE_FILTER}
) AS foo;
EOF
        )
    else
        rows=$(psql_wrap -d "$database" <<EOF
SELECT count(*) AS count
FROM (SELECT c.oid::regclass as table_name,
(current_setting('autovacuum_freeze_max_age')::INT8 - greatest(age(c.relfrozenxid),age(t.relfrozenxid))) as xid_left_to_antifreeze
FROM (pg_class c JOIN pg_namespace n ON (c.relnamespace=n.oid))
LEFT JOIN pg_class t ON c.reltoastrelid = t.oid
WHERE c.relkind IN ('r','m') and greatest(age(c.relfrozenxid),age(t.relfrozenxid))::INT8  > (current_setting('autovacuum_freeze_max_age')::INT8 * (${freeze_bound} / 100.0) )
${RELATION_SIZE_FILTER}
) AS foo;
EOF
        )
    fi

    # Intentionally unquoted: normalises psql's output to one token per line
    # and prints nothing at all when the query produced no output.
    for token in $rows; do
        echo "$token"
    done
}

# List the relations whose relfrozenxid age exceeds freeze_bound percent of
# autovacuum_freeze_max_age, ordered so that the ones with the fewest
# transaction IDs left come first.
#
# With the antifreeze extension installed, TOAST tables are listed as rows of
# their own, because a normal VACUUM also processes the TOAST table but
# pg_antifreeze does not. Without it, the older of heap and TOAST age decides.
#
# Globals: database, use_antifreeze_ext, freeze_bound,
#          RELATION_SIZE_FILTER (read)
# Outputs: one relation name per line, exactly as psql printed it. Names that
#          contain whitespace (e.g. "My Table") stay on one line instead of
#          being broken into separate words.
function close_to_antiwraparound_relations()
{
    local rows relation

    if [ "$use_antifreeze_ext" = 't' ]; then
        rows=$(psql_wrap -d "$database" <<EOF
SELECT table_name
FROM (SELECT c.oid::regclass as table_name,
(current_setting('autovacuum_freeze_max_age')::INT8 - age(c.relfrozenxid)) as xid_left_to_antifreeze,
pg_relation_size(c.oid) as size
FROM (pg_class c JOIN pg_namespace n ON (c.relnamespace=n.oid))
WHERE c.relkind IN ('r','t','m') and age(c.relfrozenxid)::INT8 > (current_setting('autovacuum_freeze_max_age')::INT8 * (${freeze_bound} / 100.0) )
${RELATION_SIZE_FILTER}
ORDER BY 2 ASC) AS foo;
EOF
        )
    else
        rows=$(psql_wrap -d "$database" <<EOF
SELECT table_name
FROM (SELECT c.oid::regclass as table_name,
(current_setting('autovacuum_freeze_max_age')::INT8 - greatest(age(c.relfrozenxid),age(t.relfrozenxid))) as xid_left_to_antifreeze,
pg_relation_size(c.oid) as size
FROM (pg_class c JOIN pg_namespace n ON (c.relnamespace=n.oid))
LEFT JOIN pg_class t ON c.reltoastrelid = t.oid
WHERE c.relkind IN ('r','m') and greatest(age(c.relfrozenxid), age(t.relfrozenxid))::INT8 > (current_setting('autovacuum_freeze_max_age')::INT8 * (${freeze_bound} / 100.0) )
${RELATION_SIZE_FILTER}
ORDER BY 2 ASC) AS foo;
EOF
        )
    fi

    # Read line by line: no word splitting and no pathname expansion on the
    # names. The here-string yields one empty line for an empty result, which
    # is skipped.
    while IFS= read -r relation; do
        if [ -n "$relation" ]; then
            printf '%s\n' "$relation"
        fi
    done <<<"$rows"
}


# List the relations whose multixact age (mxid_age of relminmxid) exceeds one
# eighth of autovacuum_multixact_freeze_max_age, ordered so that the ones
# with the fewest multixact IDs left come first. freeze_bound is not used
# here; the 1/8 factor is fixed in the query.
#
# The query is only run when the antifreeze extension is in use; heap, TOAST
# and materialized-view relations are then listed as separate rows. Without
# the extension nothing is printed.
#
# NOTE(review): the previous guard also tested server_version_num >= 90500,
# but because the query ran only inside a second "extension installed" test
# the version check never changed the outcome. Confirm whether a
# plain-VACUUM variant for 9.5+ servers was intended.
#
# Globals: database, use_antifreeze_ext, RELATION_SIZE_FILTER (read)
# Outputs: one relation name per line, exactly as psql printed it, so names
#          containing whitespace are not broken into separate words.
function close_to_multixact_antiwraparound_relations()
{
    local rows relation

    if [ "$use_antifreeze_ext" != 't' ]; then
        return 0
    fi

    rows=$(psql_wrap -d "$database" <<EOF
SELECT table_name
FROM (SELECT c.oid::regclass as table_name,
(current_setting('autovacuum_multixact_freeze_max_age')::INT8 - mxid_age(c.relminmxid)) as multixid_left_to_antifreeze,
pg_relation_size(c.oid) as size
FROM (pg_class c JOIN pg_namespace n ON (c.relnamespace=n.oid))
WHERE c.relkind IN ('r','t','m') and mxid_age(c.relminmxid)::INT8 > (current_setting('autovacuum_multixact_freeze_max_age')::INT8 * (1.0 / 8) )
${RELATION_SIZE_FILTER}
ORDER BY 2 ASC) AS foo;
EOF
    )

    # Line-wise read; the single empty line produced by an empty result is
    # skipped.
    while IFS= read -r relation; do
        if [ -n "$relation" ]; then
            printf '%s\n' "$relation"
        fi
    done <<<"$rows"
}


# Shared abort path of avoid_anti_wraparound_vacuums: log that the time
# budget ran out, bump the min XID fields in pg_database when the antifreeze
# extension was used, and report how many relations are still pending.
# Globals: database, totaltime, use_antifreeze_ext (read)
function _aawv_report_timeout()
{
    local pending

    logmsg "Execution aborted during freezing on $database database."
    logmsg "The maximum execution time of ${totaltime} seconds has been reached."

    # Before we exit, we have to bump the min XID fields in the `pg_database`
    # catalog. But only if we used the antifreeze extension.
    if [ "$use_antifreeze_ext" = 't' ]; then
        psql_wrap -d "$database" -c "SELECT pg_update_datfrozenxid();"
    fi

    pending=$(count_antiwraparound_relations)
    logmsg "There are still $pending relations to freeze in $database."
}

# Freeze every relation of the current database that is close to an XID
# anti-wraparound vacuum, then every relation close to multixact wraparound.
#
# Relation lists are consumed line by line, so a name containing whitespace
# is handed to freeze_tuples as one argument.
#
# Globals: database, totaltime, use_antifreeze_ext (read)
# Returns: 0 - all candidate relations were processed
#          1 - freeze_tuples reported that the time budget is exhausted
function avoid_anti_wraparound_vacuums()
{
    local r candidates

    candidates=$(close_to_antiwraparound_relations)
    # The list is fed through descriptor 3 so that commands run inside the
    # loop keep the script's own stdin.
    while IFS= read -r -u 3 r; do
        [ -n "$r" ] || continue

        # freeze_tuples returns non-zero once the total execution time has
        # been used, in which case the script should end.
        if ! freeze_tuples "${r}"; then
            _aawv_report_timeout
            # signal a timeout to the caller.
            return 1
        fi
    done 3<<<"$candidates"

    # Now we check for tables close to multixact wraparound
    candidates=$(close_to_multixact_antiwraparound_relations)
    while IFS= read -r -u 3 r; do
        [ -n "$r" ] || continue

        echo "Checking database $database, relation $r for MultiXact."
        if ! freeze_tuples "${r}"; then
            _aawv_report_timeout
            return 1
        fi
    done 3<<<"$candidates"

    # If we used the antifreeze extension, we have to bump the min XID fields
    # in the `pg_database` catalog.
    if [ "$use_antifreeze_ext" = 't' ]; then
        psql_wrap -d "$database" -c "SELECT pg_update_datfrozenxid();"
    fi

    logmsg "Finished freezing $database database."
    return 0
}

# Epoch timestamp taken at start-up; the time-budget helpers measure from it.
start_time=$(date '+%s')

# Debug logging switch: 0 disables it, any value greater than 0 enables it.
debugging=0

# Connection defaults, each of which can be replaced on the command line.
host="/tmp"
port="5432"
user="postgres"

# Default total execution time in seconds.
totaltime="3600"

# Upper size threshold for relations; 0 means no limit.
relmaxsize=0

# Default bound: 80% of autovacuum_freeze_max_age. The -b option overrides it.
freeze_bound=80

# Optional file with extra settings that is handed to psql before freezing.
# It starts out empty, which means "nothing to load". Whether the file
# actually exists is not verified here; that is left to the administrator,
# and a missing file does not break the script.
vacuum_configuration_file=""

# Human-readable record of the options received, shown to the user after
# parsing. Every entry is appended in the form "label -> value, ".
op_passed="Options passed: "

#
# Collect the settings passed on the command line.
# The leading ':' in the option string selects silent error reporting, so
# unknown options arrive as '?' and missing arguments as ':'.
#
while getopts ":hH:p:U:t:S:s:b:c:" optname
do
    case "$optname" in
        H)
            host=${OPTARG}
            op_passed="${op_passed}host -> ${host}, "
            ;;
        p)
            port=${OPTARG}
            op_passed="${op_passed}port -> ${port}, "
            ;;
        U)
            user=${OPTARG}
            op_passed="${op_passed}user -> ${user}, "
            ;;
        t)
            totaltime=${OPTARG}
            op_passed="${op_passed}total time -> ${totaltime}, "
            ;;
        S)
            relmaxsize=${OPTARG}
            op_passed="${op_passed}max rel size -> ${relmaxsize}, "
            ;;
        s)
            # relminsize is intentionally left unset unless -s is given;
            # later code tests whether the variable exists.
            relminsize=${OPTARG}
            op_passed="${op_passed}min rel size -> ${relminsize}, "
            ;;
        b)
            # Quoted so the raw argument reaches the validator as one word,
            # without word splitting or pathname expansion.
            freeze_bound=$(parse_freeze_bound "${OPTARG}")
            op_passed="${op_passed}freeze limit -> ${freeze_bound}, "
            ;;
        c)
            vacuum_configuration_file=${OPTARG}
            op_passed="${op_passed}config file -> ${vacuum_configuration_file}, "
            ;;
        h)
            usage
            exit 0
            ;;
        \?)
            echo "Unknown option $OPTARG"
            exit 1
            ;;
        :)
            echo "No argument value for option $OPTARG"
            exit 1
            ;;
        *)
            # Should not occur
            echo "Unknown error while processing options"
            exit 5
            ;;
    esac
done

# Print the options summary with one option per line, continuation lines
# aligned under the first entry.
#
# Arguments: $1 - summary of the form "Options passed: a -> 1, b -> 2, "
# Outputs:   the summary without its trailing separator; every ", " between
#            entries becomes a line break followed by 16 spaces (the width of
#            "Options passed: ")
#
# This replaces a sed/perl/cut pipeline that removed all commas before the
# comma-based cut ran and joined the reversed lines without newlines, which
# printed the entries on a single line in reverse order.
function print_options_summary()
{
    local summary=${1%, }
    local indent newline=$'\n'

    # With no options at all only the label remains; drop its trailing blank.
    summary=${summary% }
    printf -v indent '%16s' ''
    printf '%s\n' "${summary//, /${newline}${indent}}"
}

echo
print_options_summary "${op_passed:-}"
echo

# Verify that psql can connect with the chosen parameters before doing any
# work. psql's error output is captured in a variable rather than in a
# predictably named file under /tmp, so nothing is left behind on failure and
# no other local user can pre-create the file.
echo -n "Testing connection..."
if ! psql_test_errors=$(psql_wrap -c "SELECT 1" 2>&1 >/dev/null); then
    echo
    echo "Connection parameters invalid or missing ~/.pgpass file."
    echo "psql failed with:"
    echo "-------------"
    if [ -n "$psql_test_errors" ]; then
        printf '%s\n' "$psql_test_errors"
    fi
    echo "-------------"
    # Running without parameters starts the job with default settings; the
    # help text is shown by -h.
    echo "Run this script with the -h option for more help"
    exit 1
fi
echo ' connection ok'

# Fetch the numeric server version first, because later query choices depend
# on the version in use.
server_version_num=$(psql_wrap -d "postgres" -c "SELECT setting FROM pg_settings WHERE name = 'server_version_num'")


# Ensure we're connected to a master server, since we can't VACUUM on a
# standby. The operand is quoted so that an empty probe result is simply
# "not a standby" instead of a test syntax error.
standby=$(psql_wrap -d "postgres" -c "select pg_is_in_recovery()")
if [ "$standby" = 't' ]; then
    echo "VACUUM cannot be run on a standby server. This should be run on the master server instead"
    exit 1
fi


# Build the SQL fragment that restricts candidate relations by size.
#
# Arguments: $1 - lower bound in bytes, or empty for no lower bound
#            $2 - upper bound in bytes, 0 (or empty) for no upper bound
# Outputs:   the " AND ..." fragment on stdout (possibly empty)
# Returns:   0 on success; 1 with a message on stderr when a bound is not a
#            plain non-negative integer. The bounds are spliced into SQL
#            text, so anything else is rejected instead of being passed on or
#            silently ignored.
function build_relation_size_filter()
{
    local min_size=${1:-}
    local max_size=${2:-0}
    local filter=""

    if [ -n "$min_size" ]; then
        if [[ ! "$min_size" =~ ^[0-9]+$ ]]; then
            echo "Invalid minimum relation size (-s): ${min_size}" >&2
            return 1
        fi
        filter="${filter} AND pg_relation_size(c.oid::regclass) >= ${min_size}"
    fi

    if [[ ! "$max_size" =~ ^[0-9]+$ ]]; then
        echo "Invalid maximum relation size (-S): ${max_size}" >&2
        return 1
    fi
    # An all-zero value means "no upper limit".
    if [[ ! "$max_size" =~ ^0+$ ]]; then
        filter="${filter} AND pg_relation_size(c.oid::regclass) <= ${max_size}"
    fi

    printf '%s' "$filter"
}

# RELATION_SIZE_FILTER is appended to the WHERE clause of the relation
# queries. relminsize only exists when -s was given; relmaxsize defaults to 0.
RELATION_SIZE_FILTER=$(build_relation_size_filter "${relminsize-}" "${relmaxsize:-0}") || exit 1

# Record the current transaction ID in the log. Comparing the values logged
# on successive runs shows how many transaction IDs were consumed in between.
txid_current=$(psql_wrap -d postgres -c 'SELECT txid_current()')
logmsg "Current transaction ID = ${txid_current}"

# Prepare the query that lists the databases to process. It is built once,
# before the loops that run it, because its WHERE clause depends on the
# server version:
#   - 9.5 and later: only connectable databases whose datfrozenxid age is
#     above the freeze bound, or whose multixact age is above one eighth of
#     autovacuum_multixact_freeze_max_age;
#   - earlier versions: every connectable database.
# bdr_supervisordb is always excluded, and the databases with the fewest
# transaction IDs left come first.
list_databases_where="WHERE datallowconn and datname <> 'bdr_supervisordb'"
if [ $server_version_num -ge '90500' ]; then
    list_databases_where="WHERE datallowconn AND datname <> 'bdr_supervisordb' AND
   (age(datfrozenxid) > (current_setting('autovacuum_freeze_max_age')::INT8 * (${freeze_bound} / 100.0) ) or
       mxid_age(datminmxid)::INT8 > (current_setting('autovacuum_multixact_freeze_max_age')::INT8 * (1.0 / 8)))"
fi

list_databases_sql="
SELECT datname
FROM pg_database
${list_databases_where}
ORDER BY current_setting('autovacuum_freeze_max_age')::INT8 - age(datfrozenxid) ASC"


# Run one freezing pass over every database returned by list_databases_sql.
#
# Arguments: $1 - "retry" for the second pass (different log wording);
#                 anything else is treated as the initial pass
# Globals:   list_databases_sql (read); database, use_antifreeze_ext,
#            relations_left (written, and read by the functions called here)
#
# Database names are read line by line, so a name containing whitespace is
# kept intact. The script exits with status 0 from inside this function as
# soon as avoid_anti_wraparound_vacuums reports that the time budget is used
# up.
function run_freeze_pass()
{
    local pass_kind=${1:-initial}
    local databases

    databases=$(psql_wrap -d "postgres" -c "${list_databases_sql}")

    # The list is fed through descriptor 3 so that commands run inside the
    # loop keep the script's own stdin.
    while IFS= read -r -u 3 database; do
        [ -n "$database" ] || continue

        # Check if the antifreeze extension is installed on this database.
        # This must happen before counting, because with the extension TOAST
        # tables are frozen (and therefore counted) explicitly.
        use_antifreeze_ext=$(psql_wrap -d "$database" -c "SELECT coalesce((SELECT true FROM pg_extension WHERE extname = 'antifreeze'), false)")
        relations_left=$(count_antiwraparound_relations)

        if [ "$pass_kind" = "retry" ]; then
            logmsg "Retrying to freeze relations in $database database..."
            logmsg "Found ${relations_left} relation(s) which require freezing in $database database"
        else
            logmsg "Found ${relations_left} relation(s) which require freezing in $database database. There might also be other relations which need freezing for Multixact"
        fi

        # A non-zero result means the time budget is exhausted: end the run.
        if ! avoid_anti_wraparound_vacuums; then
            exit 0
        fi

        logmsg "======================================"
    done 3<<<"$databases"
}

# Perform the freeze on each database that we're allowed to connect to.
run_freeze_pass initial

# It's possible that the script executed for quite a number of hours, in which
# case there may be relations which previously didn't require freezing and now
# do. Make the most of any remaining time with a second pass.
update_seconds_remaining

if [ "$seconds_remaining" -le 0 ]; then
    # No more time to execute here so exit. ("return" is only valid inside a
    # function or a sourced file and would not stop an executed script.)
    exit 0
fi

run_freeze_pass retry
