4 changes: 4 additions & 0 deletions Makefile
@@ -154,6 +154,10 @@ test-integration: generate
test-e2e:
	hack/e2e-test.sh

.PHONY: test-e2e-pool
test-e2e-pool:
	hack/e2e-pool-test.sh

.PHONY: test-e2e-postdeploy
test-e2e-postdeploy:
	go test $(GO_MOD_FLAGS) -v -timeout 0 -count=1 ./test/e2e/postdeploy/...
182 changes: 182 additions & 0 deletions hack/e2e-common.sh
@@ -0,0 +1,182 @@
max_tries=60
Member Author:
Note to reviewers: There's quite a bit of cleanup/refactoring that could be done here, but I explicitly didn't do that so it'd be easier to review: use your favorite visual diff tool to compare this file to the original e2e-test.sh.
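For example (sketch only; assumes the base branch is named master and the PR is checked out locally), that comparison could look like:

    git show master:hack/e2e-test.sh > /tmp/e2e-test-orig.sh
    # Any visual diff tool works here; vimdiff is just one option.
    vimdiff /tmp/e2e-test-orig.sh hack/e2e-common.sh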

Contributor:
Do you mind moving this move to a separate commit? That would be a lot more readable.

Member Author:
I'm not sure how much more readable it would make anything. That commit would simply comprise e2e-common.sh and e2e-test.sh in the exact form they exist in this PR.

Contributor:
Possible misunderstanding here: refactoring that could be done, but isn't done in this PR.

Member Author:
Correct: I kept the existing code from e2e-test.sh, warts and all, and just moved it to e2e-common.sh with as little change as possible, precisely to make this PR more scrutable. We can refactor/reorganize later.

sleep_between_tries=10
# Set timeout for the cluster deployment to install
# timeout = sleep_between_cluster_deployment_status_checks * max_cluster_deployment_status_checks
max_cluster_deployment_status_checks=90
sleep_between_cluster_deployment_status_checks="1m"
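# With these defaults, the overall install wait is 90 checks x 1 minute = 90 minutes.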

export CLUSTER_NAMESPACE="${CLUSTER_NAMESPACE:-cluster-test}"

# In CI, HIVE_IMAGE and RELEASE_IMAGE are set via the job's `dependencies`.
if [[ -z "$HIVE_IMAGE" ]]; then
echo "The HIVE_IMAGE environment variable was not found." >&2
echo "It must be set to the fully-qualified pull spec of a hive container image." >&2
echo "E.g. quay.io/my-user/hive:latest" >&2
exit 1
fi
if [[ -z "$RELEASE_IMAGE" ]]; then
echo "The RELEASE_IMAGE environment variable was not found." >&2
echo "It must be set to the fully-qualified pull spec of an OCP release container image." >&2
echo "E.g. quay.io/openshift-release-dev/ocp-release:4.7.0-x86_64" >&2
exit 1
fi

echo "Running ${TEST_NAME} with HIVE_IMAGE ${HIVE_IMAGE}"
echo "Running ${TEST_NAME} with RELEASE_IMAGE ${RELEASE_IMAGE}"

i=1
while [ $i -le ${max_tries} ]; do
  if [ $i -gt 1 ]; then
    # Don't sleep on first loop
    echo "sleeping ${sleep_between_tries} seconds"
    sleep ${sleep_between_tries}
  fi

  echo -n "Creating namespace ${CLUSTER_NAMESPACE}. Try #${i}/${max_tries}... "
  if oc create namespace "${CLUSTER_NAMESPACE}"; then
    echo "Success"
    break
  else
    echo -n "Failed, "
  fi

  i=$((i + 1))
done
Contributor:
Shouldn't we verify that the namespace creation was successful when we exit this loop (so we can have a clearer error message in the event that this never succeeded)?

Member Author (@2uasimojo, Aug 3, 2021):
Absolutely. There are a couple of things here, including the fact that the command to restore the original namespace doesn't work at all for me. I'd like to defer that to a separate PR if possible, to keep this file as cleanly cut/paste as possible.
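For reference, a minimal post-loop check along these lines (sketch only, not in this PR) would make that failure explicit:

    # Fail fast with a clear message if the namespace never got created;
    # the retry loop above only breaks early on success.
    if ! oc get namespace "${CLUSTER_NAMESPACE}" > /dev/null 2>&1; then
      echo "Failed to create namespace ${CLUSTER_NAMESPACE} after ${max_tries} tries" >&2
      exit 10
    fi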


ORIGINAL_NAMESPACE=$(oc config view -o json | jq -er 'select(.contexts[].name == ."current-context") | .contexts[]?.context.namespace // ""')
echo Original default namespace is ${ORIGINAL_NAMESPACE}
echo Setting default namespace to ${CLUSTER_NAMESPACE}
if ! oc config set-context --current --namespace=${CLUSTER_NAMESPACE}; then
  echo "Failed to set the default namespace"
  exit 1
fi

function restore_default_namespace() {
  echo Restoring default namespace to ${ORIGINAL_NAMESPACE}
  oc config set-context --current --namespace=${ORIGINAL_NAMESPACE}
}
trap 'restore_default_namespace' EXIT

if [ $i -ge ${max_tries} ] ; then
  # Failed the maximum amount of times.
  echo "exiting"
  exit 10
fi

CLUSTER_PROFILE_DIR="${CLUSTER_PROFILE_DIR:-/tmp/cluster}"

CLOUD="${CLOUD:-aws}"
export ARTIFACT_DIR="${ARTIFACT_DIR:-/tmp}"

SSH_PUBLIC_KEY_FILE="${SSH_PUBLIC_KEY_FILE:-${CLUSTER_PROFILE_DIR}/ssh-publickey}"
# If not specified or nonexistent, generate a keypair to use
if ! [[ -s "${SSH_PUBLIC_KEY_FILE}" ]]; then
  echo "Specified SSH public key file '${SSH_PUBLIC_KEY_FILE}' is invalid or nonexistent. Generating a single-use keypair."
  WHERE=${SSH_PUBLIC_KEY_FILE%/*}
  mkdir -p ${WHERE}
  # Tell the installmanager where to find the private key
  export SSH_PRIV_KEY_PATH=$(mktemp -p ${WHERE})
  # ssh-keygen will put the public key here
  TMP_PUB=${SSH_PRIV_KEY_PATH}.pub
  # Answer 'y' to the overwrite prompt, since we touched the file
  yes y | ssh-keygen -q -t rsa -N '' -f ${SSH_PRIV_KEY_PATH}
  # Now put the pubkey where we expected it
  mv ${TMP_PUB} ${SSH_PUBLIC_KEY_FILE}
fi

PULL_SECRET_FILE="${PULL_SECRET_FILE:-${CLUSTER_PROFILE_DIR}/pull-secret}"
export HIVE_NS="hive-e2e"
export HIVE_OPERATOR_NS="hive-operator"

# Install Hive
IMG="${HIVE_IMAGE}" make deploy

function save_hive_logs() {
  oc logs -n "${HIVE_NS}" deployment/hive-controllers > "${ARTIFACT_DIR}/hive-controllers.log"
  oc logs -n "${HIVE_NS}" deployment/hiveadmission > "${ARTIFACT_DIR}/hiveadmission.log"
}

SRC_ROOT=$(git rev-parse --show-toplevel)

USE_MANAGED_DNS=${USE_MANAGED_DNS:-true}

case "${CLOUD}" in
"aws")
CREDS_FILE="${CLUSTER_PROFILE_DIR}/.awscred"
# Accept creds from the env if the file doesn't exist.
if ! [[ -f $CREDS_FILE ]] && [[ -n "${AWS_ACCESS_KEY_ID}" ]] && [[ -n "${AWS_SECRET_ACCESS_KEY}" ]]; then
# TODO: Refactor contrib/pkg/adm/managedns/enable::generateAWSCredentialsSecret to
# use contrib/pkg/utils/aws/aws::GetAWSCreds, which knows how to look for the env
# vars if the file isn't specified; and use this condition to generate (or not)
# the whole CREDS_FILE_ARG="--creds-file=${CREDS_FILE}".
printf '[default]\naws_access_key_id=%s\naws_secret_access_key=%s\n' "$AWS_ACCESS_KEY_ID" "$AWS_SECRET_ACCESS_KEY" > $CREDS_FILE
fi
BASE_DOMAIN="${BASE_DOMAIN:-hive-ci.openshift.com}"
EXTRA_CREATE_CLUSTER_ARGS="--aws-user-tags expirationDate=$(date -d '4 hours' --iso=minutes --utc)"
;;
"azure")
CREDS_FILE="${CLUSTER_PROFILE_DIR}/osServicePrincipal.json"
BASE_DOMAIN="${BASE_DOMAIN:-ci.azure.devcluster.openshift.com}"
;;
"gcp")
CREDS_FILE="${CLUSTER_PROFILE_DIR}/gce.json"
BASE_DOMAIN="${BASE_DOMAIN:-origin-ci-int-gce.dev.openshift.com}"
;;
*)
echo "unknown cloud: ${CLOUD}"
exit 1
;;
esac

if $USE_MANAGED_DNS; then
  # Generate a short random shard string for this cluster similar to OSD prod.
  # This is to prevent name conflicts across customer clusters.
  CLUSTER_SHARD=$(cat /dev/urandom | tr -dc 'a-z' | fold -w 8 | head -n 1)
  CLUSTER_DOMAIN="${CLUSTER_SHARD}.${BASE_DOMAIN}"
  go run "${SRC_ROOT}/contrib/cmd/hiveutil/main.go" adm manage-dns enable ${BASE_DOMAIN} \
    --creds-file="${CREDS_FILE}" --cloud="${CLOUD}"
  MANAGED_DNS_ARG=" --manage-dns"
else
  CLUSTER_DOMAIN="${BASE_DOMAIN}"
fi


echo "Using cluster base domain: ${CLUSTER_DOMAIN}"

function capture_manifests() {
  oc get clusterdeployment -A -o yaml &> "${ARTIFACT_DIR}/hive_clusterdeployment.yaml" || true
  oc get clusterimageset -o yaml &> "${ARTIFACT_DIR}/hive_clusterimagesets.yaml" || true
  oc get clusterprovision -A -o yaml &> "${ARTIFACT_DIR}/hive_clusterprovision.yaml" || true
  oc get clusterstate -A -o yaml &> "${ARTIFACT_DIR}/hive_clusterstate.yaml" || true
}

function capture_cluster_logs() {
  local CLUSTER_NAME=$1
  local CLUSTER_NAMESPACE=$2
  local INSTALL_RESULT=$3

  # Capture install logs
  if IMAGESET_JOB_NAME=$(oc get job -l "hive.openshift.io/cluster-deployment-name=${CLUSTER_NAME},hive.openshift.io/imageset=true" -o name -n ${CLUSTER_NAMESPACE}) && [ "${IMAGESET_JOB_NAME}" ]
  then
    oc logs -c hive -n ${CLUSTER_NAMESPACE} ${IMAGESET_JOB_NAME} &> "${ARTIFACT_DIR}/hive_imageset_job.log" || true
    oc get ${IMAGESET_JOB_NAME} -n ${CLUSTER_NAMESPACE} -o yaml &> "${ARTIFACT_DIR}/hive_imageset_job.yaml" || true
  fi
  if INSTALL_JOB_NAME=$(oc get job -l "hive.openshift.io/cluster-deployment-name=${CLUSTER_NAME},hive.openshift.io/install=true" -o name -n ${CLUSTER_NAMESPACE}) && [ "${INSTALL_JOB_NAME}" ]
  then
    oc logs -c hive -n ${CLUSTER_NAMESPACE} ${INSTALL_JOB_NAME} &> "${ARTIFACT_DIR}/hive_install_job.log" || true
    oc get ${INSTALL_JOB_NAME} -n ${CLUSTER_NAMESPACE} -o yaml &> "${ARTIFACT_DIR}/hive_install_job.yaml" || true
  fi
  echo "************* INSTALL JOB LOG *************"
  if oc get clusterprovision -l "hive.openshift.io/cluster-deployment-name=${CLUSTER_NAME}" -o jsonpath='{.items[0].spec.installLog}' &> "${ARTIFACT_DIR}/hive_install_console.log"; then
    cat "${ARTIFACT_DIR}/hive_install_console.log"
  else
    cat "${ARTIFACT_DIR}/hive_install_job.log"
  fi

  if [[ "${INSTALL_RESULT}" != "success" ]]
  then
    mkdir "${ARTIFACT_DIR}/hive"
    ${SRC_ROOT}/hack/logextractor.sh ${CLUSTER_NAME} "${ARTIFACT_DIR}/hive"
    exit 1
  fi
}
184 changes: 184 additions & 0 deletions hack/e2e-pool-test.sh
@@ -0,0 +1,184 @@
#!/bin/bash

set -ex

TEST_NAME=e2e-pool
source ${0%/*}/e2e-common.sh

# TODO: Use something better here.
# `make test-e2e-postdeploy` could work, but does more than we need.
echo "Waiting for the deployment to settle"
sleep 120
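# One possible alternative to the fixed sleep (sketch only; assumes both
# deployments are created by `make deploy` in ${HIVE_NS} and report the
# standard Available condition):
#   oc wait --for=condition=Available --timeout=5m \
#     -n "${HIVE_NS}" deployment/hive-controllers deployment/hiveadmission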

echo "Creating imageset"
IMAGESET_NAME=cis
oc apply -f - <<EOF
apiVersion: hive.openshift.io/v1
kind: ClusterImageSet
metadata:
  name: $IMAGESET_NAME
spec:
  releaseImage: $RELEASE_IMAGE
EOF

# NOTE: This is needed in order for the short form (cd) to work
oc get clusterdeployment > /dev/null

function count_cds() {
  oc get cd -A -o json | jq -r '.items | length'
}

# Verify no CDs exist yet
NUM_CDS=$(count_cds)
if [[ $NUM_CDS != "0" ]]; then
  echo "Got an unexpected number of pre-existing ClusterDeployments." >&2
  echo "Expected 0." >&2
  echo "Got: $NUM_CDS" >&2
  exit 5
fi

# Use the CLUSTER_NAME configured by the test as the pool name. This will result in CD names
# being seeded with that as a prefix, which will make them visible to our leak detector.
POOL_NAME=$CLUSTER_NAME

function cleanup() {
  capture_manifests
  # Let's save the logs now in case any of the following never finish
  echo "Saving hive logs before cleanup"
  save_hive_logs
  oc delete clusterclaim --all
  oc delete clusterpool $POOL_NAME
  # Wait indefinitely for all CDs to disappear. If we exceed the test timeout,
  # we'll get killed, and resources will leak.
  while true; do
    sleep ${sleep_between_tries}
    NUM_CDS=$(count_cds)
    if [[ $NUM_CDS == "0" ]]; then
      break
    fi
    echo "Waiting for $NUM_CDS ClusterDeployment(s) to be cleaned up"
  done
  # And if we get this far, overwrite the logs with the latest
  echo "Saving hive logs after cleanup"
  save_hive_logs
}
trap cleanup EXIT

echo "Creating cluster pool"
# TODO: This can't be changed yet -- see other TODOs (search for 'variable POOL_SIZE')
POOL_SIZE=1
# TODO: This is aws-specific at the moment.
go run "${SRC_ROOT}/contrib/cmd/hiveutil/main.go" clusterpool create-pool \
  -n "${CLUSTER_NAMESPACE}" \
  --cloud="${CLOUD}" \
  --creds-file="${CREDS_FILE}" \
  --pull-secret-file="${PULL_SECRET_FILE}" \
  --image-set "${IMAGESET_NAME}" \
  --region us-east-1 \
  --size "${POOL_SIZE}" \
  ${POOL_NAME}

echo "Waiting for pool to create $POOL_SIZE ClusterDeployment(s)"
i=1
while [[ $i -le ${max_tries} ]]; do
  if [[ $i -gt 1 ]]; then
    # Don't sleep on first loop
    echo "sleeping ${sleep_between_tries} seconds"
    sleep ${sleep_between_tries}
  fi

  NUM_CDS=$(count_cds)
  if [[ $NUM_CDS == "${POOL_SIZE}" ]]; then
    echo "Success"
    break
  else
    echo -n "Failed (found ${NUM_CDS}), "
  fi

  i=$((i + 1))
done

if [[ $i -ge ${max_tries} ]] ; then
  # Failed the maximum amount of times.
  echo "exiting"
  exit 10
fi

# Get the CD name & namespace (which should be the same)
# TODO: Set this up for variable POOL_SIZE
CLUSTER_NAME=$(oc get cd -A -o json | jq -r .items[0].metadata.name)

echo "Waiting for ClusterDeployment $CLUSTER_NAME to finish installing"
# TODO: Set this up for variable POOL_SIZE
i=1
while [[ $i -le ${max_cluster_deployment_status_checks} ]]; do
  CD_JSON=$(oc get cd -n $CLUSTER_NAME $CLUSTER_NAME -o json)
  if [[ $(jq .spec.installed <<<"${CD_JSON}") == "true" ]]; then
    echo "ClusterDeployment is Installed"
    break
  fi
  PF_COND=$(jq -r '.status.conditions[] | select(.type == "ProvisionFailed")' <<<"${CD_JSON}")
  if [[ $(jq -r .status <<<"${PF_COND}") == 'True' ]]; then
    FAILURE_REASON=$(jq -r .reason <<<"${PF_COND}")
    FAILURE_MESSAGE=$(jq -r .message <<<"${PF_COND}")
    echo "ClusterDeployment install failed with reason '$FAILURE_REASON' and message: $FAILURE_MESSAGE" >&2
    capture_cluster_logs $CLUSTER_NAME $CLUSTER_NAME failure
    exit 7
  fi
  sleep ${sleep_between_cluster_deployment_status_checks}
  echo "Still waiting for the ClusterDeployment ${CLUSTER_NAME} to install. Status check #${i}/${max_cluster_deployment_status_checks}... "
  i=$((i + 1))
done

function wait_for_hibernation_state() {
  local CLUSTER_NAME=$1
  local EXPECTED_STATE=$2
  echo "Waiting for ClusterDeployment $CLUSTER_NAME to be $EXPECTED_STATE"
  local i=1
  while [[ $i -le ${max_tries} ]]; do
    if [[ $i -gt 1 ]]; then
      # Don't sleep on first loop
      echo "sleeping ${sleep_between_tries} seconds"
      sleep ${sleep_between_tries}
    fi

    HIB_COND=$(oc get cd -n $CLUSTER_NAME $CLUSTER_NAME -o json | jq -r '.status.conditions[] | select(.type == "Hibernating")')
    if [[ $(jq -r .reason <<<"${HIB_COND}") == $EXPECTED_STATE ]]; then
      echo "Success"
      break
    else
      echo -n "Failed, "
    fi

    i=$((i + 1))
  done

  if [[ $i -ge ${max_tries} ]] ; then
    # Failed the maximum amount of times.
    echo "ClusterDeployment $CLUSTER_NAME still not $EXPECTED_STATE" >&2
    echo "Reason: $(jq -r .reason <<<"${HIB_COND}")" >&2
    echo "Message: $(jq -r .message <<<"${HIB_COND}")" >&2
    exit 9
  fi
}

wait_for_hibernation_state $CLUSTER_NAME Hibernating

echo "Claiming"
CLAIM_NAME=the-claim
go run "${SRC_ROOT}/contrib/cmd/hiveutil/main.go" clusterpool claim -n $CLUSTER_NAMESPACE $POOL_NAME $CLAIM_NAME

wait_for_hibernation_state $CLUSTER_NAME Running

echo "Re-hibernating"
oc patch cd -n $CLUSTER_NAME $CLUSTER_NAME --type=merge -p '{"spec": {"powerState": "Hibernating"}}'

wait_for_hibernation_state $CLUSTER_NAME Hibernating

echo "Re-resuming"
oc patch cd -n $CLUSTER_NAME $CLUSTER_NAME --type=merge -p '{"spec": {"powerState": "Running"}}'

wait_for_hibernation_state $CLUSTER_NAME Running

# Let the cleanup trap do the cleanup.