#!/usr/bin/env bash
#/ Usage: ghe-backup-repositories
#/ Take an online, incremental snapshot of all Git repository data.
#/
#/ Note: This command typically isn't called directly. It's invoked by
#/ ghe-backup.
set -e

# This command is designed to allow for transferring active Git repository data
# from a GitHub instance to a backup site in a way that ensures data is
# captured in a consistent state even when being written to.
#
# - All Git GC operations are disabled on the GitHub instance for the duration of
#   the backup. This removes the possibility of objects or packs being removed
#   while the backup is in progress.
#
# - In progress Git GC operations are given a cooldown window to complete. The
#   script will sleep for up to 60 seconds waiting for GC operations to finish.
#
# - Git repository data is transferred in a specific order: auxiliary files,
#   packed refs, loose refs, reflogs, and finally objects and pack files in that
#   order. This ensures that all referenced objects are captured.
#
# - Git GC operations are re-enabled on the GitHub instance.
#
# The script uses multiple runs of rsync to transfer repository files. Each run
# includes a list of filter rules that ensure only specific types of files are
# transferred.
#
# See the "FILTER RULES" and "INCLUDE/EXCLUDE PATTERN RULES" sections of the
# rsync(1) manual for more information:
#      <http://rsync.samba.org/ftp/rsync/rsync.html>

# Bring in the backup configuration
# shellcheck source=share/github-backup-utils/ghe-backup-config
. "$( dirname "${BASH_SOURCE[0]}" )/ghe-backup-config"

bm_start "$(basename "$0")"

# Set up remote host and root backup snapshot directory based on config
host="$GHE_HOSTNAME"
backup_dir="$GHE_SNAPSHOT_DIR/repositories"

# Location of last good backup for rsync --link-dest
backup_current="$GHE_DATA_DIR/current/repositories"

# Verify rsync is available.
if ! rsync --version 1>/dev/null 2>&1; then
  echo "Error: rsync not found." 1>&2
  exit 1
fi

# Perform a host-check and establish GHE_REMOTE_XXX variables.
ghe_remote_version_required "$host"

# Split host:port into parts
port=$(ssh_port_part "$GHE_HOSTNAME")
host=$(ssh_host_part "$GHE_HOSTNAME")

# Add user / -l option: take the part before "@" when the host is given as
# "user@host"; when there is no "@" the expansion leaves the value unchanged
# and the login falls back to "admin".
user="${host%@*}"
if [ "$user" = "$host" ]; then
  user="admin"
fi

# Outside of cluster mode the configured host is the only one to process and
# no custom ssh config file is used.
hostnames=$host
ssh_config_file_opt=

# Local scratch directory for route lists and the generated ssh config.
tempdir=$(mktemp -d -t backup-utils-backup-XXXXXX)
# Do not leak the local scratch directory if one of the remaining setup steps
# fails under `set -e`. The EXIT trap installed further down replaces this one.
trap 'rm -rf -- "$tempdir"' EXIT

# Scratch directory on the appliance, used for the generated routes file.
remote_tempdir=$(ghe-ssh "$GHE_HOSTNAME" -- mktemp -d -t backup-utils-backup-XXXXXX)
routes_list=$tempdir/routes_list
remote_routes_list=$remote_tempdir/remote_routes_list
opts="$GHE_EXTRA_SSH_OPTS"

# git server hostnames under cluster
if [ "$GHE_BACKUP_STRATEGY" = "cluster" ]; then
  ssh_config_file="$tempdir/ssh_config"
  # Kept as a single "-F <file>" string; it is expanded unquoted by the
  # commands that consume it so that it becomes two words.
  ssh_config_file_opt="-F $ssh_config_file"
  opts="$opts -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o PasswordAuthentication=no"
  hostnames=$(ghe-cluster-nodes "$GHE_HOSTNAME" "git-server")
  ghe-ssh-config "$GHE_HOSTNAME" "$hostnames" > "$ssh_config_file"
fi

# Make sure root backup dir exists if this is the first run
mkdir -p "$backup_dir"

# Exit handler. Kills any background jobs that are still running, re-enables
# GC operations on every host in $hostnames, and removes the remote and local
# temporary directories.
#
# Globals read: hostnames, port, ssh_config_file_opt, GHE_HOSTNAME,
#               remote_tempdir, tempdir
#
# All remote steps are best-effort: a failure to reach the appliance must not
# abort the handler (the script runs under `set -e`), otherwise the local
# temporary directory would be left behind.
cleanup() {
  local pid hostname

  for pid in $(jobs -p); do
    kill -KILL "$pid" > /dev/null 2>&1 || true
  done

  # Enable remote GC operations
  for hostname in $hostnames; do
    # $ssh_config_file_opt is intentionally unquoted: it is either empty or
    # "-F <file>" and has to expand to zero or two words.
    ghe-gc-enable $ssh_config_file_opt "$hostname:$port" || true
  done

  # Remove the scratch directory on the appliance (best-effort).
  if [ -n "$remote_tempdir" ]; then
    ghe-ssh "$GHE_HOSTNAME" -- rm -rf "$remote_tempdir" || true
  fi

  # Remove the local scratch directory.
  if [ -n "$tempdir" ]; then
    rm -rf -- "$tempdir"
  fi
}
trap 'cleanup' EXIT
trap 'exit $?' INT # ^C always terminate

# Disable remote GC operations
for hostname in $hostnames; do
  # $ssh_config_file_opt is intentionally unquoted: it is either empty or
  # "-F <file>" and has to expand to zero or two words.
  ghe-gc-disable $ssh_config_file_opt "$hostname:$port"
done

# If we have a previous increment, avoid transferring existing files via rsync's
# --link-dest support. This also decreases physical space usage considerably.
# Start from an empty value so a same-named environment variable cannot leak in.
link_dest=
if [ -d "$backup_current" ]; then
  link_dest="--link-dest=../../current/repositories"
fi

# Calculate sync routes. This will store the healthy repo paths for each node
#
# This gets a repo path and stores the path in the $node.rsync file
# a/nw/a8/3f/02/100000855 dgit-node1 >> dgit-node1.rsync
# a/nw/a8/bc/8d/100000880 dgit-node3 >> dgit-node3.rsync
# a/nw/a5/06/81/100000659 dgit-node2 >> dgit-node2.rsync
# ...
# One route per line.
#
# NOTE: The route generation is performed on the appliance as it is considerably
# more performant than performing over an SSH pipe.
#
bm_start "$(basename "$0") - Generating routes"
# The command line is fed to a remote bash on stdin, so the redirection to
# $remote_routes_list happens on the appliance.
echo "github-env ./bin/dgit-cluster-backup-routes > $remote_routes_list" | ghe-ssh "$GHE_HOSTNAME" -- /bin/bash
ghe-ssh "$GHE_HOSTNAME" -- cat "$remote_routes_list" | ghe_debug
bm_end "$(basename "$0") - Generating routes"

bm_start "$(basename "$0") - Fetching routes"
# Compress on the appliance and decompress locally to cut transfer size.
ghe-ssh "$GHE_HOSTNAME" -- gzip -c "$remote_routes_list" | gzip -d > "$routes_list"
# NOTE(review): kept as a pipe on purpose; ghe_debug is defined elsewhere and
# may depend on its input arriving through a pipe - confirm before changing.
cat "$routes_list" | ghe_debug
bm_end "$(basename "$0") - Fetching routes"

bm_start "$(basename "$0") - Processing routes"
# Outside of cluster mode every route is attributed to the single configured
# host; in cluster mode $server stays empty and the node names listed on each
# route line are used instead.
server=
if [ "$GHE_BACKUP_STRATEGY" != "cluster" ]; then
  server=$host
fi
# For each route line, field 1 is the repository path and fields 2..NF are
# node names. Append the path to "<tempdir>/<node>.rsync" for every node.
awk -v tempdir="$tempdir" -v server="$server" '{
  for (i = 2; i <= NF; i++) {
    host = (server != "" ? server : $i)
    print $1 > (tempdir "/" host ".rsync")
  }
}' < "$routes_list"
ghe_debug "\n$(find "$tempdir" -maxdepth 1 -name '*.rsync')"
bm_end "$(basename "$0") - Processing routes"

if [ -z "$(find "$tempdir" -maxdepth 1 -name '*.rsync')" ]; then
  echo "Warning: no routes found, skipping repositories backup ..."
  exit 0
fi

# Transfer repository data from a GitHub instance to the current snapshot
# directory, using a previous snapshot to avoid transferring files that have
# already been transferred. A set of rsync filter rules are provided on stdin
# for each invocation.
#
# Arguments:
#   $1   - source given as "host:port"
#   $2   - optional path ending in ".rsync"; when present it is handed to
#          rsync via --files-from
#   rest - extra options passed to rsync verbatim
# Input:   rsync filter rules on stdin (--include-from=-)
# Output:  rsync's stdout and stderr are both sent to file descriptor 3
# Globals read: opts, ssh_config_file_opt, user, link_dest,
#               GHE_REMOTE_DATA_USER_DIR, backup_dir
rsync_repository_data () {
  # Keep host/port local so a call never clobbers the script-level values
  # (the exit handler relies on the global $port when re-enabling GC).
  local port host
  port=$(ssh_port_part "$1")
  host=$(ssh_host_part "$1")
  shift

  # Check if we are syncing from a given file list
  local -a files_from=()
  if [[ "${1:-}" == *".rsync" ]]; then
    files_from=(--files-from="$1")
    shift
  fi

  # $link_dest is intentionally unquoted: it is empty when there is no
  # previous snapshot and must then expand to no argument at all.
  ghe-rsync -avr \
    -e "ssh -q $opts -p $port $ssh_config_file_opt -l $user" \
    $link_dest "$@" \
    --rsync-path='sudo -u git rsync' \
    --include-from=- --exclude=\* \
    "${files_from[@]}" \
    --ignore-missing-args \
    "$host:$GHE_REMOTE_DATA_USER_DIR/repositories/" \
    "$backup_dir" 1>&3 2>&3
}

# Run the four ordered rsync passes for one host: auxiliary files, packed
# refs, loose refs and reflogs, then objects and packs. Each pass calls
# rsync_repository_data with its own set of filter rules on stdin.
#
# Arguments:
#   $1 - host name to sync from (":122" is appended as the port)
#   $2 - optional path to a file list forwarded to rsync_repository_data
# Output: progress messages are written to file descriptor 3
sync_data (){
  # Build the leading arguments once. The file list is only forwarded when it
  # was actually given, and is kept as a single word even if it contains
  # whitespace.
  local -a rsync_target=("$1:122")
  if [ -n "${2:-}" ]; then
    rsync_target+=("$2")
  fi

  # Sync all auxiliary repository data. This includes files and directories like
  # HEAD, audit_log, config, description, info/, etc. No refs or object data
  # should be transferred here.
  echo 1>&3

  echo "* Transferring auxiliary files ..." 1>&3
  rsync_repository_data "${rsync_target[@]}" -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
- /*/*.git/objects
- /*/*.git/refs
- /*/*.git/packed-refs
- /*/*.git/logs
+ /*/*.git/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
- /*/??/??/??/gist/*.git/objects
- /*/??/??/??/gist/*.git/refs
- /*/??/??/??/gist/*.git/packed-refs
- /*/??/??/??/gist/*.git/logs
+ /*/??/??/??/gist/*.git/**

+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
- /*/nw/??/??/??/*/*.git/objects
- /*/nw/??/??/??/*/*.git/refs
- /*/nw/??/??/??/*/*.git/packed-refs
- /*/nw/??/??/??/*/*.git/logs
+ /*/nw/??/??/??/*/*.git/**
RULES

  # Sync packed refs files. This is performed before sync'ing loose refs since
  # loose refs trump packed-refs information.
  echo 1>&3
  echo "* Transferring packed-refs files ..." 1>&3
  rsync_repository_data "${rsync_target[@]}" -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/packed-refs

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/packed-refs

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/packed-refs
RULES

  # Sync loose refs and reflogs. This must be performed before object data is
  # transferred to ensure that all referenced objects are included.
  echo 1>&3
  echo "* Transferring refs and reflogs ..."  1>&3
  rsync_repository_data "${rsync_target[@]}" -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/refs
+ /*/*.git/refs/**
+ /*/*.git/logs
+ /*/*.git/logs/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/refs
+ /*/??/??/??/gist/*.git/refs/**
+ /*/??/??/??/gist/*.git/logs
+ /*/??/??/??/gist/*.git/logs/**

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/refs
+ /*/nw/??/??/??/*/*.git/refs/**
+ /*/nw/??/??/??/*/*.git/logs
+ /*/nw/??/??/??/*/*.git/logs/**
RULES

  # Sync git objects and pack files. Compression is disabled during this phase
  # since these files are already well compressed.
  echo 1>&3
  echo "* Transferring objects and packs ..." 1>&3
  rsync_repository_data "${rsync_target[@]}" -H <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/objects
- /*/*.git/objects/**/tmp_*
+ /*/*.git/objects/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/objects
- /*/??/??/??/gist/*.git/objects/**/tmp_*
+ /*/??/??/??/gist/*.git/objects/**

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/objects
- /*/nw/??/??/??/*/*.git/objects/**/tmp_*
+ /*/nw/??/??/??/*/*.git/objects/**
RULES

  echo 1>&3

}

# rsync all the repositories. One background sync_data job is started per
# "<host>.rsync" file list; the PIDs are recorded so each job can be waited on
# and its exit status checked.
bm_start "$(basename "$0") - Repo sync"
sync_pids=()
for file_list in "$tempdir"/*.rsync; do
  hostname=$(basename "$file_list" .rsync)

  repo_num=$(wc -l < "$file_list")
  ghe_verbose "* Transferring $repo_num repositories from $hostname"

  sync_data "$hostname" "$file_list" &
  sync_pids+=("$!")
done

# `wait <pid>` returns the job's exit status, so under `set -e` a failed
# transfer aborts the backup here.
for pid in "${sync_pids[@]}"; do
  wait "$pid"
done
bm_end "$(basename "$0") - Repo sync"

# Since there are no routes for special data directories, we need to do this
# serially for all hostnames. Good candidate for future optimizations.

bm_start "$(basename "$0") - Special Data Directories Sync"
for h in $hostnames; do
  # Sync __special__ data directories, including the __alambic_assets__,
  # __hookshot__, and __purgatory__ directories. The __nodeload_archives__,
  # __gitmon__, and __render__ directories are excluded since they act only as
  # caches.
  #
  # Under v2.x and greater, only the special __purgatory__ directory remains under
  # /data/repositories. All other special user data directories have been moved under
  # the /data/user directory.
  echo 1>&3
  echo "* Transferring special data directories from $h..." 1>&3
  rsync_repository_data "$h:122" -z <<RULES
- /__nodeload_archives__/
- /__gitmon__/
- /__render__/
+ /__*__/
+ /__*__/**
+ /info/
- /info/lost+found/
+ /info/*
RULES
  echo 1>&3
done
bm_end "$(basename "$0") - Special Data Directories Sync"

# Compare the routes that were requested with the repository directories that
# actually arrived in the snapshot. Skipped when GHE_SKIP_ROUTE_VERIFICATION
# is set to a non-empty value.
if [ -z "$GHE_SKIP_ROUTE_VERIFICATION" ]; then
  bm_start "$(basename "$0") - Verifying Routes"
  cat "$tempdir"/*.rsync | uniq | sort | uniq > "$tempdir/source_routes"
  (cd "$backup_dir/" && find * -mindepth 5 -maxdepth 6 -type d -name \*.git | fix_paths_for_ghe_version | uniq | sort | uniq) > "$tempdir/destination_routes"

  # --no-index forces a plain file-to-file comparison (exit status 1 on any
  # difference) regardless of whether the current directory or the temporary
  # directory happens to be inside a Git working tree.
  git --no-pager diff --no-index --unified=0 --no-prefix -- "$tempdir/source_routes" "$tempdir/destination_routes" || echo "Warning: One or more repository networks and/or gists were not found on the source appliance. Please contact GitHub Enterprise Support for assistance."

  bm_end "$(basename "$0") - Verifying Routes"
fi

bm_end "$(basename "$0")"