24 changes: 13 additions & 11 deletions ci-operator/step-registry/gather/extra/gather-extra-commands.sh
@@ -16,7 +16,9 @@ function queue() {
fi
}

export PATH=$PATH:/tmp/shared
export HOME=/tmp
export WORKSPACE=${WORKSPACE:-/tmp}
export PATH="${PATH}:${WORKSPACE}"

if test ! -f "${KUBECONFIG}"
then
@@ -27,11 +29,11 @@ fi
echo "Gathering artifacts ..."
mkdir -p ${ARTIFACT_DIR}/pods ${ARTIFACT_DIR}/nodes ${ARTIFACT_DIR}/metrics ${ARTIFACT_DIR}/bootstrap ${ARTIFACT_DIR}/network ${ARTIFACT_DIR}/oc_cmds

oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.metadata.name}{"\n"}{end}' > /tmp/nodes
oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.spec.providerID}{"\n"}{end}' | sed 's|.*/||' > /tmp/node-provider-IDs
oc --insecure-skip-tls-verify --request-timeout=5s -n openshift-machine-api get machines -o jsonpath --template '{range .items[*]}{.spec.providerID}{"\n"}{end}' | sed 's|.*/||' >> /tmp/node-provider-IDs
oc --insecure-skip-tls-verify --request-timeout=5s get pods --all-namespaces --template '{{ range .items }}{{ $name := .metadata.name }}{{ $ns := .metadata.namespace }}{{ range .spec.containers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ range .spec.initContainers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ end }}' > /tmp/containers
oc --insecure-skip-tls-verify --request-timeout=5s get pods -l openshift.io/component=api --all-namespaces --template '{{ range .items }}-n {{ .metadata.namespace }} {{ .metadata.name }}{{ "\n" }}{{ end }}' > /tmp/pods-api
oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.metadata.name}{"\n"}{end}' > ${WORKSPACE}/nodes
oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.spec.providerID}{"\n"}{end}' | sed 's|.*/||' > ${WORKSPACE}/node-provider-IDs
oc --insecure-skip-tls-verify --request-timeout=5s -n openshift-machine-api get machines -o jsonpath --template '{range .items[*]}{.spec.providerID}{"\n"}{end}' | sed 's|.*/||' >> ${WORKSPACE}/node-provider-IDs
oc --insecure-skip-tls-verify --request-timeout=5s get pods --all-namespaces --template '{{ range .items }}{{ $name := .metadata.name }}{{ $ns := .metadata.namespace }}{{ range .spec.containers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ range .spec.initContainers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ end }}' > ${WORKSPACE}/containers
oc --insecure-skip-tls-verify --request-timeout=5s get pods -l openshift.io/component=api --all-namespaces --template '{{ range .items }}-n {{ .metadata.namespace }} {{ .metadata.name }}{{ "\n" }}{{ end }}' > ${WORKSPACE}/pods-api

queue ${ARTIFACT_DIR}/config-resources.json oc --insecure-skip-tls-verify --request-timeout=5s get apiserver.config.openshift.io authentication.config.openshift.io build.config.openshift.io console.config.openshift.io dns.config.openshift.io featuregate.config.openshift.io image.config.openshift.io infrastructure.config.openshift.io ingress.config.openshift.io network.config.openshift.io oauth.config.openshift.io project.config.openshift.io scheduler.config.openshift.io -o json
queue ${ARTIFACT_DIR}/apiservices.json oc --insecure-skip-tls-verify --request-timeout=5s get apiservices -o json
@@ -94,30 +96,30 @@ FILTER=gzip queue ${ARTIFACT_DIR}/openapi.json.gz oc --insecure-skip-tls-verify
while IFS= read -r i; do
mkdir -p ${ARTIFACT_DIR}/nodes/$i
queue ${ARTIFACT_DIR}/nodes/$i/heap oc --insecure-skip-tls-verify get --request-timeout=20s --raw /api/v1/nodes/$i/proxy/debug/pprof/heap
done < /tmp/nodes
done < ${WORKSPACE}/nodes

FILTER=gzip queue ${ARTIFACT_DIR}/nodes/masters-journal.gz oc --insecure-skip-tls-verify adm node-logs --role=master --unify=false
FILTER=gzip queue ${ARTIFACT_DIR}/nodes/masters-journal-previous.gz oc --insecure-skip-tls-verify adm node-logs --boot=-1 --role=master --unify=false
FILTER=gzip queue ${ARTIFACT_DIR}/nodes/workers-journal.gz oc --insecure-skip-tls-verify adm node-logs --role=worker --unify=false
FILTER=gzip queue ${ARTIFACT_DIR}/nodes/workers-journal-previous.gz oc --insecure-skip-tls-verify adm node-logs --boot=-1 --role=worker --unify=false

# Snapshot iptables-save on each node for debugging possible kube-proxy issues
oc --insecure-skip-tls-verify get --request-timeout=20s -n openshift-sdn -l app=sdn pods --template '{{ range .items }}{{ .metadata.name }}{{ "\n" }}{{ end }}' > /tmp/sdn-pods
oc --insecure-skip-tls-verify get --request-timeout=20s -n openshift-sdn -l app=sdn pods --template '{{ range .items }}{{ .metadata.name }}{{ "\n" }}{{ end }}' > ${WORKSPACE}/sdn-pods
while IFS= read -r i; do
queue ${ARTIFACT_DIR}/network/iptables-save-$i oc --insecure-skip-tls-verify rsh --timeout=20 -n openshift-sdn -c sdn $i iptables-save -c
done < /tmp/sdn-pods
done < ${WORKSPACE}/sdn-pods

while IFS= read -r i; do
file="$( echo "$i" | cut -d ' ' -f 3 | tr -s ' ' '_' )"
queue ${ARTIFACT_DIR}/metrics/${file}-heap oc --insecure-skip-tls-verify exec $i -- /bin/bash -c 'oc --insecure-skip-tls-verify get --raw /debug/pprof/heap --server "https://$( hostname ):8443" --config /etc/origin/master/admin.kubeconfig'
queue ${ARTIFACT_DIR}/metrics/${file}-controllers-heap oc --insecure-skip-tls-verify exec $i -- /bin/bash -c 'oc --insecure-skip-tls-verify get --raw /debug/pprof/heap --server "https://$( hostname ):8444" --config /etc/origin/master/admin.kubeconfig'
done < /tmp/pods-api
done < ${WORKSPACE}/pods-api

while IFS= read -r i; do
file="$( echo "$i" | cut -d ' ' -f 2,3,5 | tr -s ' ' '_' )"
FILTER=gzip queue ${ARTIFACT_DIR}/pods/${file}.log.gz oc --insecure-skip-tls-verify logs --request-timeout=20s $i
FILTER=gzip queue ${ARTIFACT_DIR}/pods/${file}_previous.log.gz oc --insecure-skip-tls-verify logs --request-timeout=20s -p $i
done < /tmp/containers
done < ${WORKSPACE}/containers

echo "Snapshotting prometheus (may take 15s) ..."
queue ${ARTIFACT_DIR}/metrics/prometheus.tar.gz oc --insecure-skip-tls-verify exec -n openshift-monitoring prometheus-k8s-0 -- tar cvzf - -C /prometheus .
@@ -4,6 +4,10 @@ set -o nounset
set -o errexit
set -o pipefail

export HOME=/tmp
export WORKSPACE=${WORKSPACE:-/tmp}
Contributor

Do you want this to be something generically available to all steps?

Contributor Author

I think that would be useful, IMO: making sure we have the workspace and PATH set up like this.

Member

Can we use TMPDIR, which is in POSIX?
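
For comparison, a minimal sketch of what the TMPDIR-based variant being suggested might look like (this is not what the PR does; the fallback value and file name are assumptions):

```bash
#!/bin/bash
# Sketch only: reuse the POSIX TMPDIR convention instead of introducing a
# separate WORKSPACE variable; fall back to /tmp when TMPDIR is unset.
export TMPDIR="${TMPDIR:-/tmp}"
export PATH="${PATH}:${TMPDIR}"

# Scratch files would then land under TMPDIR rather than a hard-coded /tmp.
nodes_file="${TMPDIR}/nodes"
```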

Member

We can also lean on openshift/ci-tools#854 and define variables for these (example in #9676) to keep them out of the script.

Contributor Author

> Can we use TMPDIR, which is in POSIX?

I like the WORKSPACE name more because it is clearer about its responsibility, and it works better when used from a jump host.

Contributor Author

> We can also lean on openshift/ci-tools#854 and define variables for these (example in #9676) to keep them out of the script.

The step env variables are user-configurable inputs to the steps, so using them for something that is core function, i.e. where do I create my files, seems like a misuse. I would never want someone to change the location of the workspace in a workflow, because it would probably break the step: we don't create or check permissions for whatever directory we are given.

Member

> I like the WORKSPACE name more because it is clearer about its responsibility, and it works better when used from a jump host.

Is #9553 actually setting WORKSPACE anywhere?

> The step env variables are user-configurable inputs to the steps...

And they're also a way to set convenient defaults, for things where the script has no opinion.

> I would never want someone to change the location of the workspace in a workflow, because it would probably break the step: we don't create or check permissions for whatever directory we are given.

Then just hard-code the value and skip the variable? If this is something where you have some consumers that want a knob (e.g. some #9553 logic I'm missing about running these jobs with a different WORKSPACE on a bastion), then add an environment knob. And if callers point it at a non-existent directory or whatever, and the job breaks as a result, that's the caller's problem. I don't think we need defensive guards for things like that, and I don't think the lack of defensive guards is a reason to avoid a declared env variable in these internal tools.
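
To make the two options concrete, a small sketch of the contrast (variable names and paths here are illustrative, not from the PR):

```bash
#!/bin/bash
# Sketch of the two options being contrasted (illustrative only).

# Option A: hard-code the location and skip the variable entirely.
workspace=/tmp

# Option B: declare an env knob with a default; callers who point it at a
# broken directory own the resulting failure.
workspace="${WORKSPACE:-/tmp}"

mkdir -p "${workspace}"
```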

Contributor Author

I think exposing env variables from steps when they are not actually user-configurable, and when changing them just breaks things, is bad API. So I do not think a WORKSPACE knob is a good idea.

Also, #9553 only uses the bash script itself and not the step: it copies the script to the bastion and then runs it with a unique workspace. I do not need these to be available as an API at the step level just to reuse the scripts.
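
For context, a rough sketch of the reuse pattern described here, copying only the script to a bastion and running it with its own workspace; the host name, paths, and env values are assumptions rather than anything from #9553:

```bash
#!/bin/bash
# Sketch of the bastion reuse pattern; host name, paths, and env values are
# hypothetical, not taken from #9553.
scp ./gather-extra-commands.sh bastion:/home/core/gather-extra-commands.sh

ssh bastion '
  export KUBECONFIG=/home/core/kubeconfig
  export ARTIFACT_DIR=/home/core/artifacts
  export WORKSPACE="$(mktemp -d)"   # unique workspace per run
  mkdir -p "${ARTIFACT_DIR}"
  bash /home/core/gather-extra-commands.sh
'
```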

export PATH="${PATH}:${WORKSPACE}"

if test ! -f "${KUBECONFIG}"
then
echo "No kubeconfig, so no point in calling must-gather."
@@ -9,21 +9,26 @@ trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM
export AWS_SHARED_CREDENTIALS_FILE=$CLUSTER_PROFILE_DIR/.awscred
export AZURE_AUTH_LOCATION=$CLUSTER_PROFILE_DIR/osServicePrincipal.json
export GOOGLE_CLOUD_KEYFILE_JSON=$CLUSTER_PROFILE_DIR/gce.json
export HOME=/tmp
export WORKSPACE=${WORKSPACE:-/tmp}
export PATH="${PATH}:${WORKSPACE}"

echo "Deprovisioning cluster ..."
if [[ ! -s "${SHARED_DIR}/metadata.json" ]]; then
echo "Skipping: ${SHARED_DIR}/metadata.json not found."
exit
fi

cp -ar "${SHARED_DIR}" /tmp/installer
openshift-install --dir /tmp/installer destroy cluster &
dir=${WORKSPACE}/installer
mkdir -p "${dir}/"
cp -ar "${SHARED_DIR}"/* "${dir}/"
openshift-install --dir "${dir}" destroy cluster &

set +e
wait "$!"
ret="$?"
set -e

cp /tmp/installer/.openshift_install.log "${ARTIFACT_DIR}"
cp "${dir}"/.openshift_install.log "${ARTIFACT_DIR}"

exit "$ret"
@@ -16,6 +16,8 @@ export PULL_SECRET_PATH=${CLUSTER_PROFILE_DIR}/pull-secret
export OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE=${RELEASE_IMAGE_LATEST}
export OPENSHIFT_INSTALL_INVOKER=openshift-internal-ci/${JOB_NAME}/${BUILD_ID}
export HOME=/tmp
export WORKSPACE=${WORKSPACE:-/tmp}
export PATH="${PATH}:${WORKSPACE}"

case "${CLUSTER_TYPE}" in
aws) export AWS_SHARED_CREDENTIALS_FILE=${CLUSTER_PROFILE_DIR}/.awscred;;
@@ -25,8 +27,8 @@ vsphere) ;;
*) echo >&2 "Unsupported cluster type '${CLUSTER_TYPE}'"
esac

dir=/tmp/installer
mkdir "${dir}/"
dir=${WORKSPACE}/installer
mkdir -p "${dir}/"
cp "${SHARED_DIR}/install-config.yaml" "${dir}/"

# move private key to ~/.ssh/ so that installer can use it to gather logs on
@@ -7,13 +7,12 @@ set -o pipefail
export AWS_SHARED_CREDENTIALS_FILE=${CLUSTER_PROFILE_DIR}/.awscred
export AZURE_AUTH_LOCATION=${CLUSTER_PROFILE_DIR}/osServicePrincipal.json
export GCP_SHARED_CREDENTIALS_FILE=${CLUSTER_PROFILE_DIR}/gce.json
export HOME=/tmp/home
export PATH=/usr/libexec/origin:$PATH
export HOME=/tmp
export WORKSPACE=${WORKSPACE:-/tmp}
export PATH=/usr/libexec/origin:${WORKSPACE}:$PATH

trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM

mkdir -p "${HOME}"

# if the cluster profile included an insights secret, install it to the cluster to
# report support data from the support-operator
if [[ -f "${CLUSTER_PROFILE_DIR}/insights-live.yaml" ]]; then
@@ -47,16 +46,16 @@ vsphere) export TEST_PROVIDER=vsphere;;
*) echo >&2 "Unsupported cluster type '${CLUSTER_TYPE}'"; exit 1;;
esac

mkdir -p /tmp/output
cd /tmp/output
mkdir -p ${WORKSPACE}/output
cd ${WORKSPACE}/output

if [[ "${CLUSTER_TYPE}" == gcp ]]; then
pushd /tmp
pushd ${WORKSPACE}
curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-256.0.0-linux-x86_64.tar.gz
tar -xzf google-cloud-sdk-256.0.0-linux-x86_64.tar.gz
export PATH=$PATH:/tmp/google-cloud-sdk/bin
export PATH=$PATH:${WORKSPACE}/google-cloud-sdk/bin
mkdir gcloudconfig
export CLOUDSDK_CONFIG=/tmp/gcloudconfig
export CLOUDSDK_CONFIG=${WORKSPACE}/gcloudconfig
gcloud auth activate-service-account --key-file="${GCP_SHARED_CREDENTIALS_FILE}"
gcloud config set project openshift-gce-devel-ci
popd
@@ -69,5 +68,5 @@ fi

openshift-tests run "${test_suite}" \
--provider "${TEST_PROVIDER}" \
-o /tmp/artifacts/e2e.log \
--junit-dir /tmp/artifacts/junit
-o ${ARTIFACT_DIR}/e2e.log \
--junit-dir ${ARTIFACT_DIR}/junit