#!/bin/bash
#
# Runs the puppet agent for this host and then validates its monitoring
# checks. Everything printed by the script is also appended to
# /var/log/jtv/staged-puppet.log.

set -eu

. /etc/milliner-client-env.sh
mkdir -p /var/log/jtv/

# From here on, stdout and stderr are both piped through tee so they reach
# the caller and the log file.
exec &> >(tee -a /var/log/jtv/staged-puppet.log)

echo "running $0"

SCRIPT_DIR=$(dirname "$0")

# Livestatus endpoint queried for downtime information.
NAGIOS_ADDR=video-nagios.internal.justin.tv
NAGIOS_PORT=6557
NAGIOS_TIMEOUT_SECONDS=10

# Defaults; everything assigned above this point can be overridden by an
# optional "<script path>.environment" file sourced right below.
PUPPET_ENVIRONMENT=production
CODEDEPLOY_REGION=''
APP_ENV_FILE="$0.environment"
if [[ -f $APP_ENV_FILE ]]; then
    # shellcheck source=/dev/null
    . "$APP_ENV_FILE"
fi

# Assigned after the environment file is sourced, so that file cannot
# override these two.
MAX_PUPPET_RUN_COUNT=2
PROXY_FILE=/etc/profile.d/proxy.sh
if [[ -f $PROXY_FILE ]]; then
    # shellcheck source=/dev/null
    . "$PROXY_FILE"
fi

# Remember where the agent keeps its run lock, then pin the agent to the
# selected environment.
puppet_lock_file=$(puppet config print agent_catalog_run_lockfile)
puppet config set --section agent environment "$PUPPET_ENVIRONMENT"

# Nothing to do when the agent has been administratively disabled.
if [[ -f $(puppet config print agent_disabled_lockfile) ]]; then
    exit 0
fi

# Checks that must pass on every host; machine-class specific checks are
# appended below.
checks=(
    check_consul
)

VALIDATE_RETRY_ATTEMPTS=3
VALIDATE_DELAY_SECONDS=5

#######################################
# Turns the name=value tokens of a kernel command line into shell variables.
# Only tokens whose name is a valid shell identifier are imported, and the
# value is assigned literally (never evaluated), so arguments such as
# "net.ifnames=0" or "BOOT_IMAGE=(hd0,gpt2)/vmlinuz" can neither abort the
# script under `set -e` nor run as shell code.
# Arguments: $1 - file holding the command line (normally /proc/cmdline)
# Globals:   one variable per imported token (written)
# Returns:   0
#######################################
_import_kernel_args() {
    local _ka_file=$1 _ka_token
    while read -r _ka_token; do
        # name = text before the first '=', value = everything after it
        printf -v "${_ka_token%%=*}" '%s' "${_ka_token#*=}" \
            || echo "warning: cannot import kernel argument ${_ka_token%%=*}" >&2
    done < <(grep -oP '(?<!\S)[A-Za-z_][A-Za-z0-9_]*=\S+' "$_ka_file")
    return 0
}

#######################################
# Appends every non-empty line of a checklist file to the `checks` array.
# A missing or unreadable file is ignored. A final line without a trailing
# newline is still picked up, and blank lines are skipped because an empty
# check name is not a usable associative-array key later on.
# Arguments: $1 - path to the checklist file
# Globals:   checks (appended to)
# Returns:   0
#######################################
_append_checklist() {
    local _cl_file=$1 _cl_name
    if [[ ! -f $_cl_file || ! -r $_cl_file ]]; then
        return 0
    fi
    while read -r _cl_name || [[ -n $_cl_name ]]; do
        if [[ -n $_cl_name ]]; then
            checks+=("$_cl_name")
        fi
    done <"$_cl_file"
    return 0
}

# we need to know MACHINE_CLASS, which is provided via kernel arguments in the /proc/cmdline
# let's transform kernel arguments to the name=value list and import them as variables
_import_kernel_args /proc/cmdline

# The per-class checklist lives next to the (symlink-resolved) script.
if [[ -n ${MACHINE_CLASS:-} ]]; then
    _append_checklist "$(dirname "$(realpath "$0")")/$MACHINE_CLASS.checklist"
fi

# Cached result of host_is_downtimed; empty until the first query completes.
_host_is_downtimed=""

#######################################
# Reports whether the host itself has a downtime according to livestatus.
# The livestatus query runs at most once; later calls return the cached
# status.
# Globals: _host_is_downtimed (read/written), NAGIOS_ADDR, NAGIOS_PORT,
#          NAGIOS_TIMEOUT_SECONDS (read)
# Returns: 0 when the reply lists at least one downtime, otherwise the
#          non-zero status produced by jq.
# Exits:   terminates the whole script with status 1 when the livestatus
#          reply is empty.
#######################################
host_is_downtimed() {
    local reply
    if [ -z "$_host_is_downtimed" ]; then
        # we query livestatus for the downtime information of this host
        reply=$(echo -e "GET hosts\nColumns: downtimes\nFilter: host_name = $(hostname -f)\n" \
            | nc -w $NAGIOS_TIMEOUT_SECONDS $NAGIOS_ADDR $NAGIOS_PORT)
        if [ -z "$reply" ]; then
            exit 1
        fi
        # jq -e maps a true/false result of the filter onto its exit status
        if jq -e '.[0][0]|length > 0' >/dev/null <<<"$reply"; then
            _host_is_downtimed=0
        else
            _host_is_downtimed=$?
        fi
    fi
    return $_host_is_downtimed
}

# Cache for check_is_downtimed: newline-separated descriptions of the
# services of this host that currently have a downtime. Left unset until the
# first call so that "not queried yet" differs from "nothing is downtimed".
unset _check_is_downtimed

#######################################
# Reports whether at least one of the given checks has a downtime.
# Livestatus is queried once for all downtimed services of this host; the
# answer is cached in _check_is_downtimed for later calls.
# Globals:   _check_is_downtimed (read/written), NAGIOS_ADDR, NAGIOS_PORT,
#            NAGIOS_TIMEOUT_SECONDS (read)
# Arguments: check names to look up
# Returns:   0 if any given name equals a downtimed service description,
#            1 otherwise (including when no names are given)
#######################################
check_is_downtimed() {
    local check_name
    if [[ -z ${_check_is_downtimed+set} ]]; then
        _check_is_downtimed=$(echo -e "GET services\nColumns: description\nFilter: host_name = $(hostname -f)\nFilter: downtimes !=\n" \
            | nc -w "$NAGIOS_TIMEOUT_SECONDS" "$NAGIOS_ADDR" "$NAGIOS_PORT" \
            | jq -r '.[][]')
    fi
    for check_name in "$@"; do
        # An empty name would match the single empty line produced by an
        # empty downtime list, so it can never count as downtimed.
        [[ -n $check_name ]] || continue
        # -F/-x: compare the name literally against whole lines so that
        # characters such as '.' or '*' in a check name are not treated as
        # regular-expression operators.
        if grep -Fxq -- "$check_name" <<<"$_check_is_downtimed"; then
            return 0
        fi
    done
    return 1
}

#######################################
# Runs the puppet agent until it reports a clean run or the run budget is
# used up, and reports the last puppet exit code through spmu.
# A failed run is not counted against MAX_PUPPET_RUN_COUNT while the agent
# run lock file is present; in that case the run is retried after a short
# pause instead of immediately.
# Globals: CODEDEPLOY_REGION, MAX_PUPPET_RUN_COUNT, PUPPET_ENVIRONMENT,
#          SCRIPT_DIR, puppet_lock_file, checks (read)
# Returns: 0 when the last puppet exit code was 0 or 2.
# Exits:   terminates the script otherwise: status 0 when the host or one
#          of the checks is downtimed, status 1 if not.
#######################################
run_puppet() {
    local -a spmu_region_args=()
    local retries=0 exitcode

    # Pass the region to spmu only when one is configured. Keeping the
    # arguments in a local array avoids leaking a global and relying on
    # word splitting.
    if [[ -n $CODEDEPLOY_REGION ]]; then
        spmu_region_args=(-r "$CODEDEPLOY_REGION")
    fi

    while ((retries < MAX_PUPPET_RUN_COUNT)); do
        exitcode=0
        /usr/bin/puppet agent -t --environment="$PUPPET_ENVIRONMENT" --color=no || exitcode=$?
        if ((exitcode == 0)); then
            # the ${arr[@]+...} form keeps an empty array safe under `set -u`
            "${SCRIPT_DIR}/spmu" -c "$exitcode" ${spmu_region_args[@]+"${spmu_region_args[@]}"}
            return 0
        fi
        if [[ -r $puppet_lock_file ]]; then
            # The run lock is still present: retry without consuming an
            # attempt, but pause first so puppet is not relaunched in a
            # tight loop.
            sleep 1
            continue
        fi
        retries=$((retries + 1))
    done

    "${SCRIPT_DIR}/spmu" -c "$exitcode" ${spmu_region_args[@]+"${spmu_region_args[@]}"}

    # With -t (--test) the agent uses detailed exit codes; per the puppet
    # agent documentation 2 means the run succeeded and applied changes.
    if ((exitcode == 2)); then
        return 0
    fi
    # A failed run is tolerated when the host or one of its checks is
    # downtimed.
    if host_is_downtimed; then
        exit 0
    fi
    if check_is_downtimed "${checks[@]}"; then
        exit 0
    fi
    exit 1
}

# Apply puppet before validating the checks. run_puppet only returns when the
# last agent exit code was 0 or 2; on any other outcome it exits the script
# itself (0 if the host or a check is downtimed, 1 otherwise).
run_puppet

#######################################
# Runs all checks once through nrdconce and verifies that every entry of the
# `checks` array reports the state "OK".
# A check that is not OK is tolerated when it is downtimed; if the host
# itself is downtimed the whole validation passes immediately.
# Globals: checks, VALIDATE_DELAY_SECONDS (read)
# Outputs: one "Error: check ..." line per failed check on stdout
# Returns: 0 when nothing failed, 1 otherwise
#######################################
validate_checks() {
    local check_name failed=0
    local -A check_results # check name -> state parsed from nrdconce output
    sleep $VALIDATE_DELAY_SECONDS
    check_results=()
    # Restarting nrdc used to be the way to force every check to rerun, but
    # that reset the daemon's state, so nrdconce is used to run the checks a
    # single time instead. It does not update the running daemon, which means
    # its plain-text (non-JSON) output has to be parsed here: grep keeps the
    # "<name> state: ..." part of each "Service: " line and sed rewrites it to
    #   check_results[<name>]=<state>
    # The resulting lines are then sourced, filling the local array above.
    # NOTE(review): the rewritten output is executed as shell code, so this
    # relies on nrdconce output being well-formed.
    # shellcheck source=/dev/null
    . <(/usr/local/bin/nrdconce 2>&1 \
        | grep -oP "^Service: \K.*state:[^,]+" \
        | sed -e 's/^/check_results[/' -e 's/ state: [^ ]* /]=/')
    for check_name in "${checks[@]}"; do
        if [ "${check_results[$check_name]:-}" != "OK" ]; then
            if host_is_downtimed; then
                return 0
            fi
            if ! check_is_downtimed "$check_name"; then
                echo "Error: check \"$check_name\" failed"
                failed=1
            fi
        fi
    done
    return $failed
}

# Validate the checks up to VALIDATE_RETRY_ATTEMPTS times, leaving with
# status 0 on the first success. After every failed attempt the delay used
# by validate_checks is doubled.
attempt=0
while ((attempt < VALIDATE_RETRY_ATTEMPTS)); do
    if validate_checks; then
        exit 0
    fi
    echo "Attempt $attempt failed, retrying"
    ((VALIDATE_DELAY_SECONDS *= 2))
    attempt=$((attempt + 1))
done

# Every attempt failed.
echo "Some checks are failing"
exit 1
