diff --git a/ci-operator/step-registry/gather/aws-console/OWNERS b/ci-operator/step-registry/gather/aws-console/OWNERS new file mode 100644 index 0000000000000..2b40bdc329b7e --- /dev/null +++ b/ci-operator/step-registry/gather/aws-console/OWNERS @@ -0,0 +1,5 @@ +approvers: +- cgwalters +- enxebre +- vrutkovs +- wking diff --git a/ci-operator/step-registry/gather/aws-console/gather-aws-console-commands.sh b/ci-operator/step-registry/gather/aws-console/gather-aws-console-commands.sh new file mode 100755 index 0000000000000..889f69c5df031 --- /dev/null +++ b/ci-operator/step-registry/gather/aws-console/gather-aws-console-commands.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -o nounset +set -o errexit +set -o pipefail + +trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM + +export AWS_SHARED_CREDENTIALS_FILE="${CLUSTER_PROFILE_DIR}/.awscred" + +if test ! -f "${SHARED_DIR}/metadata.json" +then + echo "No metadata.json, so unknown AWS region, so unable to gathering console logs." + exit 0 +fi + +if test -f "${KUBECONFIG}" +then + oc --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.spec.providerID}{"\n"}{end}' | sed 's|.*/||' > "${TMPDIR}/node-provider-IDs.txt" & + wait "$!" + + oc --request-timeout=5s -n openshift-machine-api get machines -o jsonpath --template '{range .items[*]}{.spec.providerID}{"\n"}{end}' | sed 's|.*/||' >> "${TMPDIR}/node-provider-IDs" & + wait "$!" +else + echo "No kubeconfig; skipping providerID extraction." + exit 0 +fi + +if test -f "${SHARED_DIR}/aws-instance-ids.txt" +then + cat "${SHARED_DIR}/aws-instance-ids.txt" >> "${TMPDIR}/node-provider-IDs.txt" +fi + +REGION="$(jq -r .aws.region "${SHARED_DIR}/metadata.json")" +cat "${TMPDIR}/node-provider-IDs.txt" | sort | uniq | while read -r INSTANCE_ID +do + echo "Gathering console logs for ${INSTANCE_ID}" + aws --region "${REGION}" ec2 get-console-output --instance-id "${INSTANCE_ID}" --output text > "${ARTIFACT_DIR}/${INSTANCE_ID}" & + wait "$!" +done diff --git a/ci-operator/step-registry/gather/aws-console/gather-aws-console-ref.yaml b/ci-operator/step-registry/gather/aws-console/gather-aws-console-ref.yaml new file mode 100644 index 0000000000000..9f6659e921522 --- /dev/null +++ b/ci-operator/step-registry/gather/aws-console/gather-aws-console-ref.yaml @@ -0,0 +1,14 @@ +ref: + as: gather-aws-console + from: upi-installer + commands: gather-aws-console-commands.sh + env: + - name: TMPDIR + default: /tmp + documentation: A pathname of a directory made available for programs that need a place to create temporary files. + resources: + requests: + cpu: 300m + memory: 300Mi + documentation: |- + The pre-deprovision artifacts step collects console logs from AWS instances. It gathers console logs for all nodes and machines with a provider ID, as well as any instance IDs listed in ${SHARED_DIR}/aws-instance-ids.txt. aws-instance-ids.txt entries should have a single provider ID per line. Duplicate entries are ok; the step deduplicates before gathering. diff --git a/ci-operator/step-registry/gather/extra/gather-extra-commands.sh b/ci-operator/step-registry/gather/extra/gather-extra-commands.sh index 00f6b5e25c18c..44db37d378257 100755 --- a/ci-operator/step-registry/gather/extra/gather-extra-commands.sh +++ b/ci-operator/step-registry/gather/extra/gather-extra-commands.sh @@ -28,8 +28,6 @@ echo "Gathering artifacts ..." mkdir -p ${ARTIFACT_DIR}/pods ${ARTIFACT_DIR}/nodes ${ARTIFACT_DIR}/metrics ${ARTIFACT_DIR}/bootstrap ${ARTIFACT_DIR}/network ${ARTIFACT_DIR}/oc_cmds oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.metadata.name}{"\n"}{end}' > /tmp/nodes -oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.spec.providerID}{"\n"}{end}' | sed 's|.*/||' > /tmp/node-provider-IDs -oc --insecure-skip-tls-verify --request-timeout=5s -n openshift-machine-api get machines -o jsonpath --template '{range .items[*]}{.spec.providerID}{"\n"}{end}' | sed 's|.*/||' >> /tmp/node-provider-IDs oc --insecure-skip-tls-verify --request-timeout=5s get pods --all-namespaces --template '{{ range .items }}{{ $name := .metadata.name }}{{ $ns := .metadata.namespace }}{{ range .spec.containers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ range .spec.initContainers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ end }}' > /tmp/containers oc --insecure-skip-tls-verify --request-timeout=5s get pods -l openshift.io/component=api --all-namespaces --template '{{ range .items }}-n {{ .metadata.namespace }} {{ .metadata.name }}{{ "\n" }}{{ end }}' > /tmp/pods-api diff --git a/ci-operator/step-registry/ipi/aws/post/ipi-aws-post-chain.yaml b/ci-operator/step-registry/ipi/aws/post/ipi-aws-post-chain.yaml index 10f2532365498..3abd2f613fac7 100644 --- a/ci-operator/step-registry/ipi/aws/post/ipi-aws-post-chain.yaml +++ b/ci-operator/step-registry/ipi/aws/post/ipi-aws-post-chain.yaml @@ -1,7 +1,7 @@ chain: as: ipi-aws-post steps: + - ref: gather-aws-console - chain: ipi-deprovision documentation: |- - The IPI cleanup step contains all steps that deprovision an OpenShift - cluster on AWS, provisioned by the `ipi-aws-pre` chain. + The IPI cleanup step contains all steps that gather and deprovision an OpenShift cluster on AWS, provisioned by the `ipi-aws-pre` chain. diff --git a/ci-operator/step-registry/ipi/deprovision/ipi-deprovision-chain.yaml b/ci-operator/step-registry/ipi/deprovision/ipi-deprovision-chain.yaml index 76b5756740992..9628d9b15653d 100644 --- a/ci-operator/step-registry/ipi/deprovision/ipi-deprovision-chain.yaml +++ b/ci-operator/step-registry/ipi/deprovision/ipi-deprovision-chain.yaml @@ -4,4 +4,4 @@ chain: - chain: gather - ref: ipi-deprovision-deprovision documentation: |- - The IPI deprovision step chain contains all the individual steps necessary to deprovision an OpenShift cluster. + The IPI deprovision step chain contains all the individual steps necessary to gather and deprovision an OpenShift cluster.