#!/bin/bash

# This script handles half of the rotation of the development
# databases from the latest backup in S3. The point of this
# script is that we take the local PostgreSQL out of rotation and
# sync it to the latest backup.

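# Call with the cluster name, the version number of postgres (so we can
# determine the full data dir), the master FQDN, the wal-e cluster name and
# the S3 bucket that holds the wal-e history files.
# ./devdb-teardown cluster version master_fqdn wal_e_cluster s3_bucket
# Eg, ./devdb-teardown sitedb 9.3 devdb1.sfo01.justin.tv <wal_e_cluster> <s3_bucket>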

# data_dir='/var/lib/postgresql/9.3/main/'
# recovery_date=`date --date='5AM today' +"%Y-%m-%d %T UTC"`
# wale_dir='/etc/wale.d/env/'


# Take the rotation lock before doing anything else. devdb-lock is not
# defined in this file; presumably it (and bounce_pgbouncer, used later) comes
# from the library sourced here -- verify against devdb-lib.sh.
printf '%s\n' "Grabbing lock"
. /var/lib/postgresql/bin/devdb-lib.sh
devdb-lock /var/tmp/devdb-rotate
printf '%s\n' "Lock acquired"

# Positional arguments (read as-is; nothing here validates them):
#   $1 cluster        - cluster name; selects the consul backup key
#   $2 version        - PostgreSQL version, e.g. 9.3; selects data dir and binaries
#   $3 master_db      - FQDN of the host that should come up as master
#   $4 wal_e_cluster  - cluster component of the S3 history prefix
#   $5 s3_bucket      - S3 bucket the history files are copied from
cluster="$1"
version="$2"
master_db="$3"
wal_e_cluster="$4"
s3_bucket="$5"

# The short name is used as the replication application_name; the FQDN is
# what gets compared against master_db to decide this host's role.
host="$(hostname)"
hostname="$(hostname --fqdn)"

# Data directory that gets wiped and restored into.
dest="/var/lib/postgresql/${version}/main"

# Service control commands. They are stored as plain strings and expanded
# unquoted where they are run, so they must remain whitespace-separated words.
# NOTE(review): the earlier comment here mentioned precise vs xenial; only the
# systemctl form is present.
START_POSTGRESQL="sudo /bin/systemctl start postgresql"
STOP_POSTGRESQL="sudo /bin/systemctl stop postgresql"

# Get the recovery timestamp posted. We fail early so that we don't lose state locally.
# The URL is quoted wherever it is used because it contains '?', which the
# shell treats as a glob character in an unquoted word.
url="http://consul.internal.justin.tv/v1/kv/backups/db/${cluster}/latest?dc=us-west2"
# '// empty' makes a missing or null Value produce no output. Without it jq
# prints the string "null", which base64 happily decodes into junk bytes and
# the emptiness check below would not trigger.
latest=$(curl -s "${url}" | jq --raw-output '.[0].Value // empty' | base64 --decode)
if [ -z "${latest}" ]; then
  echo "Unable to find latest ${cluster} backup."
  exit 1
fi
# XXX AGB: Advance master a little bit so that the replica catches up
# to it. It appears there is a bug that if you restore to the exact same
# point in time, the master and replica will not establish replication.
if [ "${master_db}" == "${hostname}" ]; then
  # Compute into a scratch variable so a date(1) failure cannot silently
  # replace the target time with an empty string.
  if ! advanced_latest=$(date --utc -d "${latest} +5 minutes" +"%F %T %Z") ||
    [ -z "${advanced_latest}" ]; then
    echo "Unable to advance recovery target '${latest}' for ${cluster}."
    exit 1
  fi
  latest=${advanced_latest}
fi

# Everything below is destructive: the data directory derived from the
# version argument is removed recursively. Refuse to continue without it.
: "${version:?PostgreSQL version (argument 2) is required}"

echo "Setting up pgbouncer to route queries to remote database."
bounce_pgbouncer proxy1 remote local
bounce_pgbouncer proxy2 remote local

echo "Stopping PostgreSQL"
# Intentionally unquoted: the variable holds a whole command line.
# shellcheck disable=SC2086
${STOP_POSTGRESQL}
# ':?' aborts instead of expanding to an empty path.
rm -rf -- "${dest:?}"
mkdir -p -- "${dest}"
echo "Starting recovery to ${latest} at $(date +'%F %T %Z')."
# Stop here if the base backup could not be fetched: starting PostgreSQL on an
# empty data directory would leave the later readiness loops waiting forever.
if ! envdir /etc/wal-e.d/env /var/lib/postgresql/virt_env/wal_e/bin/wal-e --aws-instance-profile backup-fetch "${dest}" LATEST; then
  echo "wal-e backup-fetch into ${dest} failed at $(date +'%F %T %Z')."
  exit 1
fi
echo "Recover to ${latest} done at $(date +'%F %T %Z')."

echo "Copying history file to pg_xlog"
/usr/local/bin/aws s3 cp --recursive "s3://${s3_bucket}/postgres/${wal_e_cluster}/wale/history/" "${dest}/pg_xlog/"
# compgen -G succeeds when the pattern matches at least one path. Unlike
# '[ -e glob ]' it does not break when several .history files are present.
# A missing history file is only reported; the script carries on as before.
if ! compgen -G "${dest}/pg_xlog/*.history" > /dev/null; then
  echo "Copying history file failed!"
fi

# wal-e's backup-fetch leaves no recovery.conf behind, so one is created
# here with the two settings every role needs: a restore_command that pulls
# WAL segments through wal-e, and the point-in-time target. The role-specific
# pause/promote setting is appended further down.
recovery=${dest}/recovery.conf
echo "Writing recovery into ${recovery}"
{
  echo "restore_command = 'envdir /etc/wal-e.d/env /var/lib/postgresql/virt_env/wal_e/bin/wal-e --aws-instance-profile wal-fetch \"%f\" \"%p\"'"
  echo "recovery_target_time = '${latest}'"
} > "${recovery}"

if [ "${master_db}" == "${hostname}" ]; then
  # Master: recovery should run to the target and then leave recovery on its
  # own. The setting that controls this is spelled differently in 9.3/9.4
  # than in the other versions handled by the default branch.
  case "${version}" in
  "9.3" | "9.4")
    echo 'pause_at_recovery_target = false' >> "${recovery}"
    ;;
  *)
    echo 'recovery_target_action = promote' >> "${recovery}"
    ;;
  esac
  # Seems this is no longer necessary. 2018-10-24
  # echo 'standby_mode = on' >> ${recovery}
else
  # XXX AGB: I would prefer that the recovery.conf above and below
  # could be combined; however, as of this writing postgresql does not
  # combine recovery.conf recovery_target_time and standby_mode
  # settings to recover to a point and time and join as a replica. We
  # loop here until replay is paused as controlled by the
  # pause_at_recovery_target setting. 2014-11-06

  case "${version}" in
  "9.3" | "9.4")
    echo 'pause_at_recovery_target = true' >> "${recovery}"
    ;;
  *)
    echo 'recovery_target_action = pause' >> "${recovery}"
    ;;
  esac

  echo "Starting replica PostgreSQL"
  # Intentionally unquoted: the variable holds a whole command line.
  # shellcheck disable=SC2086
  ${START_POSTGRESQL}
  # pg_isready exits 0 only once the server accepts connections, so its exit
  # status can drive the loop directly.
  until "/usr/lib/postgresql/${version}/bin/pg_isready" --quiet --host=localhost; do
    sleep 10
  done
  echo "Postgresql is ready at $(date +'%F %T %Z')."
  # --no-align drops the padding psql puts around the value, so the quoted
  # comparison sees exactly 't'. An empty result (psql failed) now just keeps
  # the loop waiting instead of producing a '[' syntax error.
  # NOTE(review): pg_is_xlog_replay_paused() is the 9.x name of this function;
  # confirm which versions the default case above is expected to cover.
  until [ "$(psql --tuples-only --no-align --command "select pg_is_xlog_replay_paused()")" == 't' ]; do
    sleep 10
  done
  echo "PostgreSQL is paused at $(date +'%F %T %Z')."

  # Postgresql is frozen, and therefore caught up. We want it to join
  # as a replica so ensure we overwrite recovery.conf here, though
  # recovery.conf has probably already been renamed to recovery.done
  # by the recovery process.
  echo "Stopping PostgreSQL to rewrite ${recovery}"
  # shellcheck disable=SC2086
  ${STOP_POSTGRESQL}
  echo "recovery_target_timeline = 'latest'" > "${recovery}"
  echo 'standby_mode = on' >> "${recovery}"
  echo "primary_conninfo = 'host=${master_db} port=5432 user=replication application_name=${host}'" >> "${recovery}"
fi

echo "Starting PostgreSQL"
# Intentionally unquoted: the variable holds a whole command line.
# shellcheck disable=SC2086
${START_POSTGRESQL}

# The remaining steps apply only when this host is the master.
if [ "${master_db}" == "${hostname}" ]; then
  echo "Waiting until PostgreSQL is ready"
  # pg_isready exits 0 only once the server accepts connections, so its exit
  # status can drive the loop directly.
  until "/usr/lib/postgresql/${version}/bin/pg_isready" --quiet --host=localhost; do
    sleep 10
  done
  echo "Postgresql is ready at $(date +'%F %T %Z')."

  # 9.3/9.4 are promoted explicitly; for other versions recovery.conf was
  # written with recovery_target_action = promote instead.
  case "${version}" in
  "9.3" | "9.4")
    echo "Promoting"
    "/usr/lib/postgresql/${version}/bin/pg_ctl" promote -D "/etc/postgresql/${version}/main/"
    ;;
  esac

  # --no-align drops the padding psql puts around the value, so the quoted
  # comparison sees exactly 'f'. An empty result (psql failed) just keeps the
  # loop waiting instead of producing a '[' syntax error.
  until [ "$(psql --tuples-only --no-align --command "select pg_is_in_recovery()")" == 'f' ]; do
    sleep 10
  done
  echo "Postgresql is done with recovery at $(date +'%F %T %Z')."

  if [[ "${cluster}" == sitedb* ]]; then
    # XXX AGB: This is sitedb specific. We need to come up with a better
    # "stuff to do after restore" than this. 2015-03-24
    echo "Altering Database Name to justintv_dev"
    psql --command "alter database justintv_prod rename to justintv_dev"
    # XXX AGB: I would like to use goose for this. See D8A-15. 2015-06-25
    #psql -d justintv_dev --command "drop table goose_db_version"
    #pushd /var/lib/postgresql/migrations/justintv_dev/current
    #goose up
    #popd
    #pushd /var/lib/postgresql/migrations/justintv_prod/current
    #goose up
    #popd

    echo "Deleting PII"

    # Dropping and re-adding users.location discards every stored value; the
    # column-level grant is then re-issued on the new column. The two
    # truncates run after the transaction commits. The here-doc body is kept
    # at column 0 so the EOF terminator is recognised.
    psql -d justintv_dev <<EOF
begin;
alter table users drop column location;
alter table users add column location text;
grant update (location) on users to users_service;
commit;
truncate table facebook_connect_users;
truncate table twitter_users;
EOF
    # Signal 1 is SIGHUP. NOTE(review): presumably this makes sandstorm-agent
    # reload before the scrub queries run -- confirm the intent.
    sudo /bin/kill -1 $(cat /var/run/sandstorm-agent.pid)
    # Run each scrub script through bulk-query with identical batching options.
    /var/lib/postgresql/bin/bulk-query --sleep 0 --limit 20000 --max 0 --start 0 --verbose -d justintv_dev /var/lib/postgresql/sql/scrub_dmca.sql
    /var/lib/postgresql/bin/bulk-query --sleep 0 --limit 20000 --max 0 --start 0 --verbose -d justintv_dev /var/lib/postgresql/sql/scrub_copyright_ip.sql
    /var/lib/postgresql/bin/bulk-query --sleep 0 --limit 20000 --max 0 --start 0 --verbose -d justintv_dev /var/lib/postgresql/sql/scrub_copyrightholder.sql
    /var/lib/postgresql/bin/bulk-query --sleep 0 --limit 20000 --max 0 --start 0 --verbose -d justintv_dev /var/lib/postgresql/sql/scrub_users.sql
    /var/lib/postgresql/bin/bulk-query --sleep 0 --limit 20000 --max 0 --start 0 --verbose -d justintv_dev /var/lib/postgresql/sql/scrub_phonenumbers.sql
  fi
fi

echo "Done with teardown at $(date +'%F %T %Z')."
