From 3013859bdc4147f0e64216a5d80c586d2dfae62c Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Tue, 13 Aug 2019 15:44:19 -0700 Subject: [PATCH] ci-operator/templates/openshift/installer/cluster-launch-installer-upi-e2e: Gather on bootstrap failure Currently UPI bootstrap failures die with [1]: time="2019-08-13T20:38:56Z" level=debug msg="Still waiting for the Kubernetes API: the server could not find the requested resource" time="2019-08-13T20:39:12Z" level=info msg="Use the following commands to gather logs from the cluster" time="2019-08-13T20:39:12Z" level=info msg="openshift-install gather bootstrap --help" time="2019-08-13T20:39:12Z" level=fatal msg="waiting for Kubernetes API: context deadline exceeded" but don't actually gather those recommended logs [2]. With this commit, I've added a setup-script global GATHER_BOOTSTRAP_ARGS which the various per-platform flows can populate as they create resources. Then if the wait-for-bootstrap command dies and that variable is non-empty, we run the gather to store the logs in the installer's artifact directory. We can't use: --master ${CONTROL_PLANE_0_IP},${CONTROL_PLANE_1_IP},${CONTROL_PLANE_2_IP} because the backing installer code [3] uses StringArrayVar [4], which does not perform StringSliceVar's [5] comma-splitting. The GATHER_BOOTSTRAP_ARGS approach is a bit of a kludge, because the expansion in gather_bootstrap_and_fail is not quoted, relying instead on a lack of shell-sensitive characters in the IP arguments. That's likely fine in practice, but if we wanted to tighten it down we could switch the script from sh to Bash and use an array variable. For now, I'm punting that to future work. There's also crufty Terraform business around this in the teardown container, which I've left alone for now. 
[1]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_release/4719/rehearse-4719-pull-ci-openshift-installer-master-e2e-aws-proxy/5/artifacts/e2e-aws-proxy/installer/.openshift_install.log [2]: https://gcsweb-ci.svc.ci.openshift.org/gcs/origin-ci-test/pr-logs/pull/openshift_release/4719/rehearse-4719-pull-ci-openshift-installer-master-e2e-aws-proxy/5/artifacts/e2e-aws-proxy/installer/ [3]: https://github.com/openshift/installer/blob/8f972b45987a32cc91bc61c39a727e9a1224693d/cmd/openshift-install/gather.go#L71 [4]: https://godoc.org/github.com/spf13/pflag#FlagSet.StringArrayVar [5]: https://godoc.org/github.com/spf13/pflag#FlagSet.StringSliceVar --- .../cluster-launch-installer-upi-e2e.yaml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ci-operator/templates/openshift/installer/cluster-launch-installer-upi-e2e.yaml b/ci-operator/templates/openshift/installer/cluster-launch-installer-upi-e2e.yaml index ab828307382b3..3dbbd7a751711 100644 --- a/ci-operator/templates/openshift/installer/cluster-launch-installer-upi-e2e.yaml +++ b/ci-operator/templates/openshift/installer/cluster-launch-installer-upi-e2e.yaml @@ -340,6 +340,8 @@ objects: value: ${BASE_DOMAIN} - name: SSH_PUB_KEY_PATH value: /etc/openshift-installer/ssh-publickey + - name: SSH_PRIVATE_KEY_PATH + value: /etc/openshift-installer/ssh-privatekey - name: PULL_SECRET_PATH value: /etc/openshift-installer/pull-secret - name: TFVARS_PATH @@ -363,6 +365,15 @@ objects: trap 'rc=$?; if test "${rc}" -eq 0; then touch /tmp/setup-success; else touch /tmp/exit; fi; exit "${rc}"' EXIT trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM + GATHER_BOOTSTRAP_ARGS= + + function gather_bootstrap_and_fail() { + if test -n "${GATHER_BOOTSTRAP_ARGS}"; then + openshift-install --dir=/tmp/artifacts/installer gather bootstrap --key "${SSH_PRIVATE_KEY_PATH}" ${GATHER_BOOTSTRAP_ARGS} + fi + return 1 + } + while true; do if [[ -f /tmp/exit 
]]; then echo "Another process exited" 2>&1 @@ -591,6 +602,7 @@ objects: BOOTSTRAP_IP="$(aws cloudformation describe-stacks --stack-name "${CLUSTER_NAME}-bootstrap" \ --query 'Stacks[].Outputs[?OutputKey == `BootstrapPublicIp`].OutputValue' --output text)" + GATHER_BOOTSTRAP_ARGS="${GATHER_BOOTSTRAP_ARGS} --bootstrap ${BOOTSTRAP_IP}" aws cloudformation create-stack \ --stack-name "${CLUSTER_NAME}-control-plane" \ @@ -622,6 +634,7 @@ objects: CONTROL_PLANE_0_IP="$(echo "${CONTROL_PLANE_IPS}" | cut -d, -f1)" CONTROL_PLANE_1_IP="$(echo "${CONTROL_PLANE_IPS}" | cut -d, -f2)" CONTROL_PLANE_2_IP="$(echo "${CONTROL_PLANE_IPS}" | cut -d, -f3)" + GATHER_BOOTSTRAP_ARGS="${GATHER_BOOTSTRAP_ARGS} --master ${CONTROL_PLANE_0_IP} --master ${CONTROL_PLANE_1_IP} --master ${CONTROL_PLANE_2_IP}" for INDEX in 0 1 2 do @@ -708,7 +721,7 @@ objects: echo "Waiting for bootstrap to complete" openshift-install --dir=/tmp/artifacts/installer wait-for bootstrap-complete & - wait "$!" + wait "$!" || gather_bootstrap_and_fail echo "Bootstrap complete, destroying bootstrap resources" if [[ "${CLUSTER_TYPE}" == "aws" ]]; then