Skip to content
147 changes: 103 additions & 44 deletions test/integration/hotrestart_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,72 @@

set -e

# The following functions are borrowed from PageSpeed's system test
# infrastructure, where are part of a bash system-test helper library.
# I'm putting them here for this one test to help me debug it, with
# the thinking that if there's traction for this infrastructure I'll
# factor it out into a helpers class for Envoy. Original source link:
#
# https://github.com/apache/incubator-pagespeed-mod/blob/c7cc4f22c79ada8077be2a16afc376dc8f8bd2da/pagespeed/automatic/system_test_helpers.sh#L383

CURRENT_TEST="NONE"
function start_test() {
CURRENT_TEST="$@"
echo "TEST: $CURRENT_TEST"
}

check() {
echo " check" "$@" ...
"$@" || handle_failure
}

BACKGROUND_PID="?"
run_in_background_saving_pid() {
echo " backgrounding:" "$@" ...
"$@" &
BACKGROUND_PID="$!"
}

# By default, print a message like:
# failure at line 374
# FAIL
# and then exit with return value 1. If we expected this test to fail, log to
# $EXPECTED_FAILURES and return without exiting.
#
# If the shell does not support the 'caller' builtin, skip the line number info.
#
# Assumes it's being called from a failure-reporting function and that the
# actual failure the user is interested in is our caller's caller. If it
# weren't for this, fail and handle_failure could be the same.
handle_failure() {
if [ $# -eq 1 ]; then
echo FAILed Input: "$1"
fi

# From http://stackoverflow.com/questions/685435/bash-stacktrace
# to avoid printing 'handle_failure' we start with 1 to skip get_stack caller
local i
local stack_size=${#FUNCNAME[@]}
for (( i=1; i<$stack_size ; i++ )); do
local func="${FUNCNAME[$i]}"
[ -z "$func" ] && func=MAIN
local line_number="${BASH_LINENO[(( i - 1 ))]}"
local src="${BASH_SOURCE[$i]}"
[ -z "$src" ] && src=non_file_source
echo "${src}:${line_number}: $func"
done

# Note: we print line number after "failed input" so that it doesn't get
# knocked out of the terminal buffer.
if type caller > /dev/null 2>&1 ; then
# "caller 1" is our caller's caller.
echo " failure at line $(caller 1 | sed 's/ .*//')" 1>&2
fi
echo "in '$CURRENT_TEST'"
echo FAIL.
exit 1
}

# The heapchecker outputs some data to stderr on every execution. This gets intermingled
# with the output from --hot-restart-version, so disable the heap-checker for these runs.
disableHeapCheck () {
Expand All @@ -25,6 +91,7 @@ JSON_TEST_ARRAY=()
if [[ -z "${ENVOY_IP_TEST_VERSIONS}" ]] || [[ "${ENVOY_IP_TEST_VERSIONS}" == "all" ]] \
|| [[ "${ENVOY_IP_TEST_VERSIONS}" == "v4only" ]]; then
HOT_RESTART_JSON_V4="${TEST_TMPDIR}"/hot_restart_v4.json
echo building ${HOT_RESTART_JSON_V4} ...
cat "${TEST_RUNDIR}"/test/config/integration/server.json |
sed -e "s#{{ upstream_. }}#0#g" | \
sed -e "s#{{ test_rundir }}#$TEST_RUNDIR#" | \
Expand Down Expand Up @@ -58,23 +125,23 @@ echo "Hot restart test using --base-id ${BASE_ID}"
TEST_INDEX=0
for HOT_RESTART_JSON in "${JSON_TEST_ARRAY[@]}"
do
# Test validation.
# TODO(jun03): instead of setting the base-id, the validate server should use the nop hot restart
"${ENVOY_BIN}" -c "${HOT_RESTART_JSON}" --mode validate --service-cluster cluster \
start_test validation
check "${ENVOY_BIN}" -c "${HOT_RESTART_JSON}" --mode validate --service-cluster cluster \
--service-node node --base-id "${BASE_ID}"

# Now start the real server, hot restart it twice, and shut it all down as a basic hot restart
# sanity test.
echo "Starting epoch 0"
start_test Starting epoch 0
ADMIN_ADDRESS_PATH_0="${TEST_TMPDIR}"/admin.0."${TEST_INDEX}".address
"${ENVOY_BIN}" -c "${HOT_RESTART_JSON}" \
run_in_background_saving_pid "${ENVOY_BIN}" -c "${HOT_RESTART_JSON}" \
--restart-epoch 0 --base-id "${BASE_ID}" --service-cluster cluster --service-node node \
--admin-address-path "${ADMIN_ADDRESS_PATH_0}" &
--admin-address-path "${ADMIN_ADDRESS_PATH_0}"

FIRST_SERVER_PID=$!
sleep 3
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without some time here (not that I love arbitrary sleep in test...), couldn't we send signals to the process before it has installed signal handlers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch; done.

FIRST_SERVER_PID=$BACKGROUND_PID

echo "Updating original config json listener addresses"
start_test Updating original config json listener addresses
sleep 3
UPDATED_HOT_RESTART_JSON="${TEST_TMPDIR}"/hot_restart_updated."${TEST_INDEX}".json
"${TEST_RUNDIR}"/tools/socket_passing "-o" "${HOT_RESTART_JSON}" "-a" "${ADMIN_ADDRESS_PATH_0}" \
"-u" "${UPDATED_HOT_RESTART_JSON}"
Expand All @@ -88,84 +155,76 @@ do

disableHeapCheck

echo "Checking for match of --hot-restart-version and admin /hot_restart_version"
start_test Checking for match of --hot-restart-version and admin /hot_restart_version
ADMIN_ADDRESS_0=$(cat "${ADMIN_ADDRESS_PATH_0}")
echo fetching hot restart version from http://${ADMIN_ADDRESS_0}/hot_restart_version ...
ADMIN_HOT_RESTART_VERSION=$(curl -sg http://${ADMIN_ADDRESS_0}/hot_restart_version)
CLI_HOT_RESTART_VERSION=$("${ENVOY_BIN}" --hot-restart-version --base-id "${BASE_ID}" 2>&1)
if [[ "${ADMIN_HOT_RESTART_VERSION}" != "${CLI_HOT_RESTART_VERSION}" ]]; then
echo "Hot restart version mismatch: ${ADMIN_HOT_RESTART_VERSION} != " \
"${CLI_HOT_RESTART_VERSION}"
exit 2
fi
check [ "${ADMIN_HOT_RESTART_VERSION}" = "${CLI_HOT_RESTART_VERSION}" ]

echo "Checking max-obj-name-len"
CLI_HOT_RESTART_VERSION=$("${ENVOY_BIN}" --hot-restart-version --max-obj-name-len 1234 --base-id "${BASE_ID}" 2>&1)
if [[ "${ADMIN_HOT_RESTART_VERSION}" = "${CLI_HOT_RESTART_VERSION}" ]]; then
echo "Hot restart version match when it should mismatch: ${ADMIN_HOT_RESTART_VERSION} == " \
"${CLI_HOT_RESTART_VERSION}"
exit 2
fi
start_test Checking for hot-restart-version mismatch when max-obj-name-len differs
CLI_HOT_RESTART_VERSION=$("${ENVOY_BIN}" --hot-restart-version --base-id "${BASE_ID}" \
--max-obj-name-len 1234 2>&1)
check [ "${ADMIN_HOT_RESTART_VERSION}" != "${CLI_HOT_RESTART_VERSION}" ]

echo "Checking max-stats"
CLI_HOT_RESTART_VERSION=$("${ENVOY_BIN}" --hot-restart-version --max-stats 12345 --base-id "${BASE_ID}" 2>&1)
if [[ "${ADMIN_HOT_RESTART_VERSION}" = "${CLI_HOT_RESTART_VERSION}" ]]; then
echo "Hot restart version match when it should mismatch: ${ADMIN_HOT_RESTART_VERSION} == " \
"${CLI_HOT_RESTART_VERSION}"
exit 2
fi
start_test Checking for hot-start-version mismatch when max-stats differs
CLI_HOT_RESTART_VERSION=$("${ENVOY_BIN}" --hot-restart-version --base-id "${BASE_ID}" \
--max-stats 12345 2>&1)
check [ "${ADMIN_HOT_RESTART_VERSION}" != "${CLI_HOT_RESTART_VERSION}" ]

enableHeapCheck

echo "Starting epoch 1"
start_test Starting epoch 1
ADMIN_ADDRESS_PATH_1="${TEST_TMPDIR}"/admin.1."${TEST_INDEX}".address
"${ENVOY_BIN}" -c "${UPDATED_HOT_RESTART_JSON}" \
run_in_background_saving_pid "${ENVOY_BIN}" -c "${UPDATED_HOT_RESTART_JSON}" \
--restart-epoch 1 --base-id "${BASE_ID}" --service-cluster cluster --service-node node \
--admin-address-path "${ADMIN_ADDRESS_PATH_1}" &
--admin-address-path "${ADMIN_ADDRESS_PATH_1}"

SECOND_SERVER_PID=$BACKGROUND_PID

SECOND_SERVER_PID=$!
# Wait for stat flushing
sleep 7

echo "Checking that listener addresses have not changed"
start_test Checking that listener addresses have not changed
HOT_RESTART_JSON_1="${TEST_TMPDIR}"/hot_restart.1."${TEST_INDEX}".json
"${TEST_RUNDIR}"/tools/socket_passing "-o" "${UPDATED_HOT_RESTART_JSON}" "-a" "${ADMIN_ADDRESS_PATH_1}" \
"-u" "${HOT_RESTART_JSON_1}"
CONFIG_DIFF=$(diff "${UPDATED_HOT_RESTART_JSON}" "${HOT_RESTART_JSON_1}")
[[ -z "${CONFIG_DIFF}" ]]

ADMIN_ADDRESS_PATH_2="${TEST_TMPDIR}"/admin.2."${TEST_INDEX}".address
echo "Starting epoch 2"
"${ENVOY_BIN}" -c "${UPDATED_HOT_RESTART_JSON}" \
--restart-epoch 2 --base-id "${BASE_ID}" --service-cluster cluster --service-node node \
--admin-address-path "${ADMIN_ADDRESS_PATH_2}" &
start_test Starting epoch 2
run_in_background_saving_pid "${ENVOY_BIN}" -c "${UPDATED_HOT_RESTART_JSON}" \
--restart-epoch 2 --base-id "${BASE_ID}" --service-cluster cluster --service-node node \
--admin-address-path "${ADMIN_ADDRESS_PATH_2}"

THIRD_SERVER_PID=$!
THIRD_SERVER_PID=$BACKGROUND_PID
sleep 3

echo "Checking that listener addresses have not changed"
start_test Checking that listener addresses have not changed
HOT_RESTART_JSON_2="${TEST_TMPDIR}"/hot_restart.2."${TEST_INDEX}".json
"${TEST_RUNDIR}"/tools/socket_passing "-o" "${UPDATED_HOT_RESTART_JSON}" "-a" "${ADMIN_ADDRESS_PATH_2}" \
"-u" "${HOT_RESTART_JSON_2}"
CONFIG_DIFF=$(diff "${UPDATED_HOT_RESTART_JSON}" "${HOT_RESTART_JSON_2}")
[[ -z "${CONFIG_DIFF}" ]]

# First server should already be gone.
echo "Waiting for epoch 0"
start_test Waiting for epoch 0
wait ${FIRST_SERVER_PID}
[[ $? == 0 ]]

#Send SIGUSR1 signal to the second server, this should not kill it
echo "Sending SIGUSR1 to the second server"
start_test Sending SIGUSR1 to the second server
kill -SIGUSR1 ${SECOND_SERVER_PID}
sleep 3

# Now term the last server, and the other one should exit also.
echo "Killing and waiting for epoch 2"
start_test Killing and waiting for epoch 2
kill ${THIRD_SERVER_PID}
wait ${THIRD_SERVER_PID}
[[ $? == 0 ]]

echo "Waiting for epoch 1"
start_test Waiting for epoch 1
wait ${SECOND_SERVER_PID}
[[ $? == 0 ]]
TEST_INDEX=$((TEST_INDEX+1))
Expand All @@ -176,7 +235,7 @@ done
set +e
disableHeapCheck

echo "Launching envoy with no parameters. Check the exit value is 1"
start_test Launching envoy with no parameters. Check the exit value is 1
${ENVOY_BIN} --base_id "${BASE_ID}"
EXIT_CODE=$?
# The test should fail if the Envoy binary exits with anything other than 1.
Expand Down