#!/bin/bash

# this script retrieves job information from the SQS job queue, starts a test
# run for the netboot image and notifies the jenkins job where to expect the results

set -u # unset variable is an error

# time we wait for the message on the sqs, this is used for long polling
WAIT_TIME_SECONDS=10
# this queue is used by jenkins to dispatch testing jobs
QUEUE_URL="https://us-west-2.queue.amazonaws.com/277437172290/netboot-validator-jobs"

# location of directories with test run data
NETBOOT_VALIDATOR_RUN_DIR=/var/netboot-validator/run
# how many test runs we want to keep
NETBOOT_VALIDATOR_RUN_KEEP=999

# jenkins queue parameters
JQ_MAX_MESSAGE_SIZE=2048
JQ_MESSAGE_RETENTION_PERIOD=600
JQ_VISIBILITY_TIMEOUT=30

# time to cleanup rogue queues
JQ_CLEANUP_TIMEOUT=86400 # 1 day

# attributes of the queue that is created later to notify the jenkins job of the
# test results location; built piecewise so that no continuation-line
# indentation ends up inside the value passed to "aws sqs create-queue"
JQ_ATTRS="MaximumMessageSize=$JQ_MAX_MESSAGE_SIZE"
JQ_ATTRS+=",MessageRetentionPeriod=$JQ_MESSAGE_RETENTION_PERIOD"
JQ_ATTRS+=",VisibilityTimeout=$JQ_VISIBILITY_TIMEOUT"

# instance identity is read from the EC2 instance metadata endpoint; -f makes
# curl return nothing (instead of an error page) when the request fails
METADATA_URL=http://169.254.169.254/latest/meta-data
instance_id=$(curl -sf "$METADATA_URL/instance-id")
AZ=$(curl -sf "$METADATA_URL/placement/availability-zone")
# without these values every later aws call would be issued with bogus
# arguments, so we stop right away with a clear message
: "${instance_id:?cannot read instance id from the instance metadata endpoint}"
: "${AZ:?cannot read availability zone from the instance metadata endpoint}"
# region is the availability zone without its trailing letter
REGION=${AZ%?}
# "hostname -I" prints every configured address separated by spaces; we use the
# first one (gluing all of them together would give an unusable host name)
HOSTNAME_I=$(hostname -I | awk '{print $1}')
# SECONDS is a special bash variable: after this assignment it keeps counting
# from the assigned value, so it always holds the current time in seconds
# since epoch
SECONDS=$(date +%s)

# this function is forked to run the test asynchronously
# Runs one test job: downloads the test data, boots the netboot image in qemu,
# waits for the initial puppet run, executes "rake spec" and stores the results
# in the current directory. Afterwards it prunes old run directories and stale
# notification queues.
# Globals (read): NETBOOT_VALIDATOR_RUN_DIR, NETBOOT_VALIDATOR_RUN_KEEP,
#                 JQ_CLEANUP_TIMEOUT, REGION, SECONDS, run_id, test_url,
#                 jenkins_queue_url, distro_codename, environment
# Expects:        the current directory to be the work directory of the run
# Outputs:        files ssh-key*, ssh_config, serial.log, metadata,
#                 firstboot.log and status
# Returns:        never returns, it always ends with "exit", so it has to be
#                 run in a forked (background) shell
run_test() {
    # test run work directory
    local work_dir="$NETBOOT_VALIDATOR_RUN_DIR/${run_id}"
    # qemu parameters
    local tftp="$work_dir/output"
    local bootfile=/boot.ipxe
    # qemu serial log location
    local serial_log="$work_dir/serial.log"
    # how much time we wait for the initial puppet run to complete
    local puppet_wait_time=300
    # location of the ssh config to access vm
    local ssh_config="$work_dir/ssh_config"
    local ssh_port status url timestamp
    # pattern for notification queue names: "netboot-validator-" followed by
    # a '%Y%m%d%H%M%S' timestamp as the third dash separated field
    local queue_re='^netboot-validator-([0-9]{4})([0-9]{2})([0-9]{2})([0-9]{2})([0-9]{2})([0-9]{2})(-|$)'
    # deliberately not local: it is read by at_exit from the EXIT trap
    qemu_pid=

    # function to get free port to access vm via ssh
    get_free_port() {
        local port
        while true; do
            port=$((1024 + RANDOM % 8192))
            # skip ports that already have a tcp listener
            netstat -lnt | grep -q ":${port}\b" && continue
            echo "$port"
            return
        done
    }

    # function to poll vm if initial puppet run is over
    # returns 0 when the finish marker was found, 1 on timeout
    wait_for_puppet() {
        local deadline=$((SECONDS + puppet_wait_time))
        while ((SECONDS < deadline)); do
            ssh -F "$ssh_config" pxe 'grep "Notice: Finished catalog run in " /var/log/jtv/puppet-cron.log' && return 0
            sleep 1
        done
        return 1
    }

    # function that will be called on the script exit, it stops the vm
    at_exit() {
        # qemu_pid is empty when we exit before qemu was started
        [[ -n ${qemu_pid:-} ]] && kill "$qemu_pid"
    }

    set -x            # for tracing progress
    trap at_exit EXIT # set exit handler
    # we run as www-data and home is not writable so we need to redefine $HOME
    HOME=$(pwd)
    export HOME
    # let's get test run data
    curl -sf "$test_url" | tar xzf - \
        || echo "Error: failed to download or unpack test data from $test_url" >&2
    ssh_port=$(get_free_port)
    ssh-keygen -N "" -f ./ssh-key >/dev/null 2>&1 # to access vm we generate keys ...
    # ... and create config that will be used by the serverspec tests
    printf "%s\n" \
        "Host pxe" \
        "  HostName 127.0.0.1" \
        "  User toor" \
        "  Port $ssh_port" \
        "  IdentityFile $work_dir/ssh-key" \
        "  UserKnownHostsFile /dev/null" \
        "  StrictHostKeyChecking no" \
        "  PasswordAuthentication no" \
        "  LogLevel FATAL" >"$ssh_config"
    export KVM_SSH_CONFIG="$ssh_config" # this is needed by the serverspec helper
    # prepare-boot.sh will generate boot.ipxe with proper urls and post-boot script to set ssh key in the vm
    ./prepare-boot.sh "http://10.0.2.2/run/${run_id}/output" "ssh-key.pub"
    qemu-system-x86_64 \
        -boot n \
        -device virtio-net-pci,netdev=n1 \
        -netdev "user,id=n1,tftp=${tftp},bootfile=${bootfile},hostfwd=tcp::${ssh_port}-:22,domainname=$(hostname -d)" \
        -nographic \
        -m 4096 \
        -enable-kvm \
        -cpu max >"$serial_log" 2>&1 &
    qemu_pid=$!
    # distro_codename and environment are optional job parameters; the defaults
    # keep "set -u" from aborting the run when a job does not provide them
    cat << EOF > metadata
    qemu_pid=$qemu_pid
    distro=${distro_codename:-}
    environment=${environment:-}
EOF

    # status is 0 only if puppet finished in time and the specs passed
    wait_for_puppet && rake spec
    status=$?
    ssh -F "$ssh_config" pxe 'cat /var/log/firstboot.log' >firstboot.log 2>&1
    echo "$status" >status

    # let's cleanup old test runs: directories are listed newest first and
    # everything after the first NETBOOT_VALIDATOR_RUN_KEEP entries is removed
    # ("tail -n +N" starts printing at line N, hence the +1)
    cd "$NETBOOT_VALIDATOR_RUN_DIR" \
        && ls -dt -- * | tail -n +$((NETBOOT_VALIDATOR_RUN_KEEP + 1)) | xargs -r rm -rf --
    # here we delete rogue queues; they are listed in the region where the
    # notification queues are created
    for url in $(aws --region "$REGION" sqs list-queues --queue-name-prefix netboot-validator- \
        | jq -r '.QueueUrls[]?'); do
        # vidtools want timestamp to be in a human readable form ('%Y%m%d%H%M%S') so we need to convert it to
        # seconds since epoch; queues whose name carries no parsable timestamp are left alone
        [[ ${url##*/} =~ $queue_re ]] || continue
        timestamp=$(date -d "${BASH_REMATCH[1]}-${BASH_REMATCH[2]}-${BASH_REMATCH[3]} ${BASH_REMATCH[4]}:${BASH_REMATCH[5]}:${BASH_REMATCH[6]}" +%s) || continue
        # if queue is older than JQ_CLEANUP_TIMEOUT we delete it
        ((timestamp + JQ_CLEANUP_TIMEOUT < SECONDS)) && aws sqs delete-queue --queue-url "$url" --region "$REGION"
    done
    # here we delete queue that was used by this test run (best effort)
    if [[ -n ${jenkins_queue_url:-} ]]; then
        aws sqs delete-queue --queue-url "$jenkins_queue_url" --region "$REGION" || true
    fi
    exit
}

# main loop starts here
# Main loop: while the instance is in service, take one job at a time from the
# job queue, create the per-run notification queue for jenkins and fork
# run_test in the run's work directory.

# a job parameter line has the form "param_name=param_value"
param_re='^([A-Za-z_][A-Za-z0-9_]*)=(.*)$'
# run_id ends up in a directory name and in an SQS queue name, so it is limited
# to letters, digits, '_' and '-'
run_id_re='^[A-Za-z0-9_-]+$'

while true; do
    # if this machine is scheduled for termination by the auto scaling group we sleep forever
    # since we don't want to run any new test jobs on this machine
    if ! lifecycle_state="$(aws autoscaling describe-auto-scaling-instances \
        --instance-ids "$instance_id" \
        --region "$REGION" \
        --query "AutoScalingInstances[].LifecycleState" --output text)"; then
        # the query itself failed, so the state is unknown: we take no job and
        # ask again later instead of parking this worker forever
        echo "Error: cannot query auto scaling lifecycle state, will retry" >&2
        sleep "$WAIT_TIME_SECONDS"
        continue
    fi
    [[ $lifecycle_state == "InService" ]] || sleep infinity
    # we try to get job data from the queue
    if ! message_json="$(aws sqs receive-message \
        --region "$REGION" \
        --queue-url "$QUEUE_URL" \
        --wait-time-seconds "$WAIT_TIME_SECONDS")"; then
        # a failing call returns immediately, the pause prevents a busy loop
        sleep 1
        continue
    fi
    [[ -z $message_json ]] && continue
    # on success we retrieve Body (with test run data) and ReceiptHandle (which we need to delete message)
    # "// empty" gives an empty string instead of the word "null" for a missing field
    Body="$(jq -r '.Messages[0].Body // empty' <<<"$message_json")"
    ReceiptHandle="$(jq -r '.Messages[0].ReceiptHandle // empty' <<<"$message_json")"
    [[ -n $ReceiptHandle ]] || continue
    # we delete message since we will run this test job
    aws sqs delete-message --region "$REGION" --queue-url "$QUEUE_URL" --receipt-handle "$ReceiptHandle"
    # parameters of the previous job must not leak into this one
    unset test_url run_id distro_codename environment
    # Body contains parameters in the form "param_name=param_value", one per line.
    # Only the parameters this script actually uses are accepted and they are
    # stored with printf -v, so a message can neither overwrite other variables
    # of this script nor get anything evaluated by the shell
    while read -r param; do
        [[ $param =~ $param_re ]] || continue
        case "${BASH_REMATCH[1]}" in
            test_url | run_id | distro_codename | environment)
                printf -v "${BASH_REMATCH[1]}" '%s' "${BASH_REMATCH[2]}"
                ;;
            *)
                echo "Warning: ignoring unsupported job parameter '${BASH_REMATCH[1]}'" >&2
                ;;
        esac
    done <<<"$Body"
    # test_url is mandatory parameter that is used to download test run data
    if [[ -z ${test_url:-} ]]; then
        echo "Error: job has no test url" >&2
        continue
    fi
    # if run_id was not provided we take the last directory of the test url
    [[ -n ${run_id:-} ]] || run_id="$(basename "$(dirname "$test_url")")"
    if [[ ! $run_id =~ $run_id_re ]]; then
        echo "Error: invalid run id '$run_id'" >&2
        continue
    fi
    work_dir="$NETBOOT_VALIDATOR_RUN_DIR/${run_id}"
    # here we create queue to notify jenkins with test run results location
    jenkins_queue_url="$(aws sqs create-queue \
        --region "$REGION" \
        --queue-name "netboot-validator-$run_id" \
        --attributes "$JQ_ATTRS" \
        | jq -r '.QueueUrl // empty')"
    if [[ -n $jenkins_queue_url ]]; then
        # and send actual message
        aws sqs send-message --queue-url "$jenkins_queue_url" \
            --message-body "run_url=http://${HOSTNAME_I}/run/${run_id}" \
            --region "$REGION" >/dev/null 2>&1
    else
        echo "Error: cannot create notification queue for run $run_id" >&2
    fi
    # run_test works in the current directory; if the work directory cannot be
    # created (e.g. the run id was already used) we must not start the test,
    # otherwise it would run inside the directory of the previous job
    if ! { mkdir "$work_dir" && cd "$work_dir"; }; then
        echo "Error: cannot create work directory $work_dir" >&2
        continue
    fi
    # now we do asynchronous test run
    run_test >shell.log 2>&1 &
done
