diff --git a/pkg/steps/clusterinstall/template.go b/pkg/steps/clusterinstall/template.go
index b8f7c8ba066..8008d5190ee 100644
--- a/pkg/steps/clusterinstall/template.go
+++ b/pkg/steps/clusterinstall/template.go
@@ -31,7 +31,6 @@ parameters:
required: false
- name: CLUSTER_VARIANT
- name: USE_LEASE_CLIENT
-- name: LEASED_RESOURCE
objects:
@@ -99,7 +98,7 @@ objects:
namespace: ${NAMESPACE}
annotations:
# we want to gather the teardown logs no matter what
- ci-operator.openshift.io/wait-for-container-artifacts: resourcewatch,teardown
+ ci-operator.openshift.io/wait-for-container-artifacts: teardown
ci-operator.openshift.io/save-container-logs: "true"
ci-operator.openshift.io/container-sub-tests: "setup,test,teardown"
spec:
@@ -116,54 +115,6 @@ objects:
secretName: ${JOB_NAME_SAFE}-cluster-profile
containers:
- - name: resourcewatch
- image: ${IMAGE_TESTS}
- terminationMessagePolicy: FallbackToLogsOnError
- volumeMounts:
- - name: shared-tmp
- mountPath: /tmp/shared
- - name: artifacts
- mountPath: /tmp/artifacts
- env:
- - name: ARTIFACT_DIR
- value: /tmp/artifacts
- - name: HOME
- value: /tmp/home
- - name: KUBECONFIG
- value: /tmp/artifacts/installer/auth/kubeconfig
- command:
- - /bin/bash
- - -c
- - |
- #!/bin/bash
- set -uo pipefail
-
- function runwatcher() {
- while true; do
- [[ ! -f "${KUBECONFIG}" ]] && sleep 1s && continue # make sure we have KUBECONFIG
- echo "== $(date) =="
- oc get clusteroperators --request-timeout=5s --insecure-skip-tls-verify --ignore-not-found -o jsonpath='{range .items[*]}{"\n"}{.metadata.name} {range .status.conditions[*]}{" "}{.type}={.status}({.reason}[{.message}])'
- sleep 5s
- done
- }
-
- trap 'jobs -p | xargs -r kill || true; exit 0' TERM
-
- runwatcher &> /tmp/artifacts/resourcewatch.log &
- watcherpid=$!
-
- for i in $(seq 1 220); do
- if [[ -f /tmp/shared/exit ]]; then
- echo "== watch terminated at $(date)" >>/tmp/artifacts/resourcewatch.log
- kill $watcherpid
- exit 0
- fi
- sleep 60 &
- sleeppid=$!
- wait $sleeppid
- done
-
-
# Once the cluster is up, executes shared tests
- name: test
image: ${IMAGE_TESTS}
@@ -196,8 +147,6 @@ objects:
value: ${IMAGE_FORMAT}
- name: KUBECONFIG
value: /tmp/artifacts/installer/auth/kubeconfig
- - name: MIRROR_BASE
- value: registry.svc.ci.openshift.org/${NAMESPACE}/release
command:
- /bin/bash
- -c
@@ -211,26 +160,48 @@ objects:
trap 'jobs -p | xargs -r kill || true; exit 0' TERM
function fips_check() {
- oc --insecure-skip-tls-verify --request-timeout=60s get nodes -o jsonpath --template '{range .items[*]}{.metadata.name}{"\n"}{end}' > /tmp/nodelist
- while IFS= read -r i; do
- oc -n default --insecure-skip-tls-verify --request-timeout=60s debug --image centos:7 node/$i -- cat /proc/sys/crypto/fips_enabled > /tmp/enabled
+ get_nodes=$(oc --request-timeout=60s get nodes -o jsonpath --template '{range .items[*]}{.metadata.name}{"\n"}{end}')
+ nodes=( $get_nodes )
+ # bash doesn't handle '.' in array elements easily
+ num_nodes="${#nodes[@]}"
+ # TODO: This must be replaced by code that waits for all the expected number
+ # of nodes to be ready.
+ for (( i=0; i<$num_nodes; i++ )); do
+ attempt=0
+ while true; do
+ out=$(oc --request-timeout=60s -n default debug node/"${nodes[i]}" -- cat /proc/sys/crypto/fips_enabled || true)
+ if [[ ! -z "${out}" ]]; then
+ break
+ fi
+ attempt=$(( attempt + 1 ))
+ if [[ $attempt -gt 3 ]]; then
+ break
+ fi
+ echo "command failed, $(( 4 - $attempt )) retries left"
+ sleep 5
+ done
+
+ if [[ -z "${out}" ]]; then
+ echo "oc debug node/${nodes[i]} failed"
+ exit 1
+ fi
if [[ "${CLUSTER_VARIANT}" =~ "fips" ]]; then
- if [[ $(< /tmp/enabled) == "0" ]]; then
- echo fips not enabled in node "$i" but should be, exiting
+ if [[ "${out}" != "1" ]]; then # string compare: -ne would evaluate non-numeric output as an arithmetic expression
+ echo "fips not enabled in node ${nodes[i]} but should be, exiting"
exit 1
fi
else
- if [[ $(< /tmp/enabled) == "1" ]]; then
- echo fips is enabled in node "$i" but should not be, exiting
+ if [[ "${out}" != "0" ]]; then # anything other than a literal 0 fails the check
+ echo "fips is enabled in node ${nodes[i]} but should not be, exiting"
exit 1
fi
fi
- done samples-patch.yaml
- op: add
path: /spec/skippedImagestreams
@@ -312,9 +283,7 @@ objects:
export KUBE_SSH_USER=core
mkdir -p ~/.ssh
cp /tmp/cluster/ssh-privatekey ~/.ssh/google_compute_engine || true
- # TODO: make openshift-tests auto-discover this from cluster config
- REGION="$(oc get -o jsonpath='{.status.platformStatus.gcp.region}' infrastructure cluster)"
- export TEST_PROVIDER="{\"type\":\"gce\",\"region\":\"${REGION}\",\"multizone\": true,\"multimaster\":true,\"projectid\":\"openshift-gce-devel-ci\"}"
+ export TEST_PROVIDER='{"type":"gce","region":"us-east1","multizone": true,"multimaster":true,"projectid":"openshift-gce-devel-ci"}'
elif [[ "${CLUSTER_TYPE}" == "aws" ]]; then
mkdir -p ~/.ssh
cp /tmp/cluster/ssh-privatekey ~/.ssh/kube_aws_rsa || true
@@ -374,13 +343,13 @@ objects:
function run-upgrade-tests() {
openshift-tests run-upgrade "${TEST_SUITE}" --to-image "${IMAGE:-${RELEASE_IMAGE_LATEST}}" \
- --options "${TEST_OPTIONS:-}" \
- --provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit
+ --options "${TEST_UPGRADE_OPTIONS:-}" \
+ --provider "${TEST_PROVIDER:-}" -o "${ARTIFACT_DIR}/e2e.log" --junit-dir "${ARTIFACT_DIR}/junit" # quoted so the paths are not word-split or globbed
}
function run-tests() {
openshift-tests run "${TEST_SUITE}" \
- --provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit
+ --provider "${TEST_PROVIDER:-}" -o "${ARTIFACT_DIR}/e2e.log" --junit-dir "${ARTIFACT_DIR}/junit" # quoted so the paths are not word-split or globbed
}
if [[ "${CLUSTER_TYPE}" == "gcp" ]]; then
@@ -401,10 +370,14 @@ objects:
- name: artifacts
mountPath: /tmp/artifacts
env:
+ - name: ARTIFACT_DIR
+ value: /tmp/artifacts
- name: AWS_SHARED_CREDENTIALS_FILE
value: /etc/openshift-installer/.awscred
- name: AZURE_AUTH_LOCATION
value: /etc/openshift-installer/osServicePrincipal.json
+ - name: GCP_REGION
+ value: us-east1
- name: GCP_PROJECT
value: openshift-gce-devel-ci
- name: GOOGLE_CLOUD_KEYFILE_JSON
@@ -429,8 +402,6 @@ objects:
value: test
- name: HOME
value: /tmp
- - name: MIRROR_BASE
- value: registry.svc.ci.openshift.org/${NAMESPACE}/release
- name: INSTALL_INITIAL_RELEASE
- name: RELEASE_IMAGE_INITIAL
command:
@@ -443,7 +414,7 @@ objects:
trap 'rc=$?; if test "${rc}" -eq 0; then touch /tmp/setup-success; else touch /tmp/exit /tmp/setup-failed; fi; exit "${rc}"' EXIT
trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM
cp "$(command -v openshift-install)" /tmp
- mkdir /tmp/artifacts/installer
+ mkdir ${ARTIFACT_DIR}/installer
function has_variant() {
regex="(^|,)$1($|,)"
@@ -463,6 +434,7 @@ objects:
# mirror the release image and override the release image to point to the mirrored one
mkdir /tmp/.docker && cp /etc/openshift-installer/pull-secret /tmp/.docker/config.json
oc registry login
+ MIRROR_BASE=$( oc get is release -o 'jsonpath={.status.publicDockerImageRepository}' )
oc adm release new --from-release ${RELEASE_IMAGE_LATEST} --to-image ${MIRROR_BASE}-scratch:release --mirror ${MIRROR_BASE}-scratch || echo 'ignore: the release could not be reproduced from its inputs'
oc adm release mirror --from ${MIRROR_BASE}-scratch:release --to ${MIRROR_BASE} --to-release-image ${MIRROR_BASE}:mirrored
RELEASE_PAYLOAD_IMAGE_SHA=$(oc get istag ${MIRROR_BASE##*/}:mirrored -o=jsonpath="{.image.metadata.name}")
@@ -483,24 +455,35 @@ objects:
mkdir -p ~/.ssh
cp "${SSH_PRIV_KEY_PATH}" ~/.ssh/
+ masters=3
+ if has_variant "single-node" ;then
+ masters=1
+ fi
+
workers=3
- if has_variant "compact"; then
+ if has_variant "compact" || has_variant "multisocket" || has_variant "single-node"; then
workers=0
fi
- if [[ "${CLUSTER_TYPE}" = "aws" ]]; then
+
+ if [[ "${CLUSTER_TYPE}" == "aws" ]]; then
master_type=null
- if has_variant "xlarge"; then
+ if has_variant "multisocket"; then
+ master_type=c5n.metal
+ elif has_variant "xlarge"; then
master_type=m5.8xlarge
elif has_variant "large"; then
master_type=m5.4xlarge
elif has_variant "compact"; then
master_type=m5.2xlarge
fi
- AWS_REGION="${LEASED_RESOURCE}"
- case "${AWS_REGION}" in
- us-east-1)
+ case $((RANDOM % 4)) in
+ 0) AWS_REGION=us-east-1
ZONE_1=us-east-1b
ZONE_2=us-east-1c;;
+ 1) AWS_REGION=us-east-2;;
+ 2) AWS_REGION=us-west-1;;
+ 3) AWS_REGION=us-west-2;;
+ *) echo >&2 "invalid AWS region index"; exit 1;;
esac
echo "AWS region: ${AWS_REGION} (zones: ${ZONE_1:-${AWS_REGION}a} ${ZONE_2:-${AWS_REGION}b})"
subnets="[]"
@@ -526,14 +509,14 @@ objects:
esac
echo "Subnets : ${subnets}"
fi
- cat > /tmp/artifacts/installer/install-config.yaml << EOF
+ cat > ${ARTIFACT_DIR}/installer/install-config.yaml << EOF
apiVersion: v1
baseDomain: ${BASE_DOMAIN:-origin-ci-int-aws.dev.rhcloud.com}
metadata:
name: ${CLUSTER_NAME}
controlPlane:
name: master
- replicas: 3
+ replicas: ${masters}
platform:
aws:
type: ${master_type}
@@ -562,7 +545,17 @@ objects:
EOF
elif [[ "${CLUSTER_TYPE}" == "azure4" ]]; then
- AZURE_REGION="${LEASED_RESOURCE}"
+ case $((RANDOM % 8)) in
+ 0) AZURE_REGION=centralus;;
+ 1) AZURE_REGION=centralus;;
+ 2) AZURE_REGION=centralus;;
+ 3) AZURE_REGION=centralus;;
+ 4) AZURE_REGION=centralus;;
+ 5) AZURE_REGION=centralus;;
+ 6) AZURE_REGION=eastus2;;
+ 7) AZURE_REGION=westus;;
+ *) echo >&2 "invalid Azure region index"; exit 1;;
+ esac
echo "Azure region: ${AZURE_REGION}"
vnetrg=""
@@ -575,7 +568,7 @@ objects:
ctrlsubnet="subnet-1"
computesubnet="subnet-2"
fi
- cat > /tmp/artifacts/installer/install-config.yaml << EOF
+ cat > ${ARTIFACT_DIR}/installer/install-config.yaml << EOF
apiVersion: v1
baseDomain: ${BASE_DOMAIN:-ci.azure.devcluster.openshift.com}
metadata:
@@ -586,6 +579,9 @@ objects:
compute:
- name: worker
replicas: ${workers}
+ platform:
+ azure:
+ type: Standard_D4s_v3
platform:
azure:
baseDomainResourceGroupName: os4-common
@@ -600,7 +596,6 @@ objects:
${SSH_PUB_KEY}
EOF
elif [[ "${CLUSTER_TYPE}" == "gcp" ]]; then
- GCP_REGION="${LEASED_RESOURCE}"
master_type=null
if has_variant "xlarge"; then
master_type=n1-standard-32
@@ -630,7 +625,7 @@ objects:
ctrlsubnet="do-not-delete-shared-master-subnet"
computesubnet="do-not-delete-shared-worker-subnet"
fi
- cat > /tmp/artifacts/installer/install-config.yaml << EOF
+ cat > ${ARTIFACT_DIR}/installer/install-config.yaml << EOF
apiVersion: v1
baseDomain: ${BASE_DOMAIN:-origin-ci-int-gce.dev.openshift.com}
metadata:
@@ -669,7 +664,7 @@ objects:
# use a http endpoint for the httpsProxy value
# TODO: revert back to using https://ewolinet:5f6ccbbbafc66013d012839921ada773@35.231.5.161:3128/
- cat >> /tmp/artifacts/installer/install-config.yaml << EOF
+ cat >> ${ARTIFACT_DIR}/installer/install-config.yaml << EOF
proxy:
httpsProxy: http://ewolinet:5f6ccbbbafc66013d012839921ada773@35.196.128.173:3128/
httpProxy: http://ewolinet:5f6ccbbbafc66013d012839921ada773@35.196.128.173:3128/
@@ -749,10 +744,23 @@ objects:
network_type="${CLUSTER_NETWORK_TYPE-}"
if has_variant "ovn"; then
network_type=OVNKubernetes
+ elif has_variant "calico"; then
+ network_type=Calico
+ fi
+
+ cidr_size=16
+ host_prefix=23
+ if has_variant "xlarge" || has_variant "large"; then
+ cidr_size=12
+ host_prefix=22
+ if [[ -z "${network_type}" ]]; then
+ network_type=OpenShiftSDN
+ fi
fi
+
if has_variant "ipv6"; then
export OPENSHIFT_INSTALL_AZURE_EMULATE_SINGLESTACK_IPV6=true
- cat >> /tmp/artifacts/installer/install-config.yaml << EOF
+ cat >> ${ARTIFACT_DIR}/installer/install-config.yaml << EOF
networking:
networkType: OVNKubernetes
machineNetwork:
@@ -765,14 +773,18 @@ objects:
- fd02::/112
EOF
elif [[ -n "${network_type}" ]]; then
- cat >> /tmp/artifacts/installer/install-config.yaml << EOF
+ cat >> ${ARTIFACT_DIR}/installer/install-config.yaml << EOF
networking:
networkType: ${network_type}
+ machineNetwork:
+ - cidr: 10.0.0.0/16
+ clusterNetwork:
+ - cidr: 10.128.0.0/${cidr_size}
+ hostPrefix: ${host_prefix}
EOF
fi
-
if has_variant "mirror"; then
- cat >> /tmp/artifacts/installer/install-config.yaml << EOF
+ cat >> ${ARTIFACT_DIR}/installer/install-config.yaml << EOF
imageContentSources:
- source: "${MIRROR_BASE}-scratch"
mirrors:
@@ -781,28 +793,61 @@ objects:
fi
if has_variant "fips"; then
- cat >> /tmp/artifacts/installer/install-config.yaml << EOF
+ cat >> ${ARTIFACT_DIR}/installer/install-config.yaml << EOF
fips: true
EOF
fi
- if has_variant "preserve_bootstrap"; then
+ if has_variant "preserve-bootstrap"; then
export OPENSHIFT_INSTALL_PRESERVE_BOOTSTRAP=true
fi
- # TODO: Replace with a more concise manifest injection approach
+ openshift-install --dir=${ARTIFACT_DIR}/installer/ create manifests &
+ wait "$!"
+
+ manifests=${ARTIFACT_DIR}/installer/manifests/
+
+ sed -i '/^ channel:/d' ${manifests}/cvo-overrides.yaml
+
if [[ -n "${CLUSTER_NETWORK_MANIFEST:-}" ]]; then
- openshift-install --dir=/tmp/artifacts/installer/ create manifests
- echo "${CLUSTER_NETWORK_MANIFEST}" > /tmp/artifacts/installer/manifests/cluster-network-03-config.yml
+ echo "${CLUSTER_NETWORK_MANIFEST}" > ${manifests}/cluster-network-03-config.yml
+ fi
+
+ if [[ "${network_type}" == "Calico" ]]; then
+ pushd ${manifests}/..
+
+ # Adapted from https://docs.projectcalico.org/getting-started/openshift/installation; --fail makes curl exit non-zero on an HTTP error instead of saving the error page as a manifest
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/01-crd-installation.yaml -o manifests/01-crd-installation.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/01-crd-tigerastatus.yaml -o manifests/01-crd-tigerastatus.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_bgpconfigurations.yaml -o manifests/crd.projectcalico.org_bgpconfigurations.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_bgppeers.yaml -o manifests/crd.projectcalico.org_bgppeers.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_blockaffinities.yaml -o manifests/crd.projectcalico.org_blockaffinities.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_clusterinformations.yaml -o manifests/crd.projectcalico.org_clusterinformations.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_felixconfigurations.yaml -o manifests/crd.projectcalico.org_felixconfigurations.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_globalnetworkpolicies.yaml -o manifests/crd.projectcalico.org_globalnetworkpolicies.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_globalnetworksets.yaml -o manifests/crd.projectcalico.org_globalnetworksets.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_hostendpoints.yaml -o manifests/crd.projectcalico.org_hostendpoints.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_ipamblocks.yaml -o manifests/crd.projectcalico.org_ipamblocks.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_ipamconfigs.yaml -o manifests/crd.projectcalico.org_ipamconfigs.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_ipamhandles.yaml -o manifests/crd.projectcalico.org_ipamhandles.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_ippools.yaml -o manifests/crd.projectcalico.org_ippools.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_kubecontrollersconfigurations.yaml -o manifests/crd.projectcalico.org_kubecontrollersconfigurations.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_networkpolicies.yaml -o manifests/crd.projectcalico.org_networkpolicies.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/crds/calico/kdd/crd.projectcalico.org_networksets.yaml -o manifests/crd.projectcalico.org_networksets.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/tigera-operator/00-namespace-tigera-operator.yaml -o manifests/00-namespace-tigera-operator.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/tigera-operator/02-rolebinding-tigera-operator.yaml -o manifests/02-rolebinding-tigera-operator.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/tigera-operator/02-role-tigera-operator.yaml -o manifests/02-role-tigera-operator.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/tigera-operator/02-serviceaccount-tigera-operator.yaml -o manifests/02-serviceaccount-tigera-operator.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/tigera-operator/02-configmap-calico-resources.yaml -o manifests/02-configmap-calico-resources.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/tigera-operator/02-tigera-operator.yaml -o manifests/02-tigera-operator.yaml
+ curl --fail https://docs.projectcalico.org/manifests/ocp/01-cr-installation.yaml -o manifests/01-cr-installation.yaml
+ # end adapted
+
+ popd
fi
if has_variant "rt"; then
- if [[ -n "${CLUSTER_NETWORK_MANIFEST:-}" ]]; then
- echo 'error: CLUSTER_NETWORK_MANIFEST is incompatible with the "rt" variant'
- exit 1
- fi
- openshift-install --dir=/tmp/artifacts/installer/ create manifests
- cat > /tmp/artifacts/installer/manifests/realtime-worker-machine-config.yml << EOF
+ cat > ${manifests}/realtime-worker-machine-config.yml << EOF
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
@@ -814,9 +859,63 @@ objects:
EOF
fi
- TF_LOG=debug openshift-install --dir=/tmp/artifacts/installer create cluster 2>&1 | grep --line-buffered -v password &
+ if has_variant "rt-debug"; then
+ cat > ${manifests}/realtime-worker-machine-config.yml << EOF
+ apiVersion: machineconfiguration.openshift.io/v1
+ kind: MachineConfig
+ metadata:
+ labels:
+ machineconfiguration.openshift.io/role: worker
+ name: realtime-worker
+ spec:
+ kernelType: realtime
+ EOF
+ cat > ${manifests}/worker-kernel-debug-machine-config.yml << EOF
+ apiVersion: machineconfiguration.openshift.io/v1
+ kind: MachineConfig
+ metadata:
+ labels:
+ machineconfiguration.openshift.io/role: worker
+ name: kernel-debug
+ spec:
+ kernelArguments:
+ - 'debug'
+ EOF
+ fi
+
+ if has_variant "multisocket"; then
+ # TODO: waiting for https://issues.redhat.com/browse/GRPA-1895
+ cat > ${manifests}/multisocket-machine-config.yml << EOF
+ ---
+ apiVersion: machineconfiguration.openshift.io/v1
+ kind: MachineConfigPool
+ metadata:
+ labels:
+ topology-manager: enabled
+ name: master
+ ---
+ apiVersion: machineconfiguration.openshift.io/v1
+ kind: KubeletConfig
+ metadata:
+ name: enable-topology-manager
+ spec:
+ machineConfigPoolSelector:
+ matchLabels:
+ topology-manager: enabled
+ kubeletConfig:
+ cpuManagerPolicy: "static"
+ cpuManagerReconcilePeriod: "10s"
+ topologyManagerPolicy: "single-numa-node"
+ reservedSystemCPUs: 1,3,5,7
+ EOF
+ fi
+
+ TF_LOG=debug openshift-install --dir=${ARTIFACT_DIR}/installer create cluster 2>&1 | grep --line-buffered -v password &
wait "$!"
+ # The installer log contains the cluster password; overwrite everything after "password: " in place. NOTE(review): the replacement ends with a double quote, presumably to re-close a quoted log field - confirm.
+ sed -i 's/password: .*/password: REDACTED"/g' ${ARTIFACT_DIR}/installer/.openshift_install.log
+
# Performs cleanup of all created resources
- name: teardown
image: ${IMAGE_TESTS}
@@ -829,6 +928,8 @@ objects:
- name: artifacts
mountPath: /tmp/artifacts
env:
+ - name: ARTIFACT_DIR
+ value: /tmp/artifacts
- name: INSTANCE_PREFIX
value: ${NAMESPACE}-${JOB_NAME_HASH}
- name: AWS_SHARED_CREDENTIALS_FILE
@@ -874,7 +975,7 @@ objects:
export PATH=$PATH:/tmp/shared
echo "Gathering artifacts ..."
- mkdir -p /tmp/artifacts/pods /tmp/artifacts/nodes /tmp/artifacts/metrics /tmp/artifacts/bootstrap /tmp/artifacts/network
+ mkdir -p ${ARTIFACT_DIR}/pods ${ARTIFACT_DIR}/nodes ${ARTIFACT_DIR}/metrics ${ARTIFACT_DIR}/bootstrap ${ARTIFACT_DIR}/network
oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.metadata.name}{"\n"}{end}' > /tmp/nodes
oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.spec.providerID}{"\n"}{end}' | sed 's|.*/||' > /tmp/node-provider-IDs
@@ -882,106 +983,110 @@ objects:
oc --insecure-skip-tls-verify --request-timeout=5s get pods --all-namespaces --template '{{ range .items }}{{ $name := .metadata.name }}{{ $ns := .metadata.namespace }}{{ range .spec.containers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ range .spec.initContainers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ end }}' > /tmp/containers
oc --insecure-skip-tls-verify --request-timeout=5s get pods -l openshift.io/component=api --all-namespaces --template '{{ range .items }}-n {{ .metadata.namespace }} {{ .metadata.name }}{{ "\n" }}{{ end }}' > /tmp/pods-api
- queue /tmp/artifacts/config-resources.json oc --insecure-skip-tls-verify --request-timeout=5s get apiserver.config.openshift.io authentication.config.openshift.io build.config.openshift.io console.config.openshift.io dns.config.openshift.io featuregate.config.openshift.io image.config.openshift.io infrastructure.config.openshift.io ingress.config.openshift.io network.config.openshift.io oauth.config.openshift.io project.config.openshift.io scheduler.config.openshift.io -o json
- queue /tmp/artifacts/apiservices.json oc --insecure-skip-tls-verify --request-timeout=5s get apiservices -o json
- queue /tmp/artifacts/clusteroperators.json oc --insecure-skip-tls-verify --request-timeout=5s get clusteroperators -o json
- queue /tmp/artifacts/clusterversion.json oc --insecure-skip-tls-verify --request-timeout=5s get clusterversion -o json
- queue /tmp/artifacts/configmaps.json oc --insecure-skip-tls-verify --request-timeout=5s get configmaps --all-namespaces -o json
- queue /tmp/artifacts/credentialsrequests.json oc --insecure-skip-tls-verify --request-timeout=5s get credentialsrequests --all-namespaces -o json
- queue /tmp/artifacts/csr.json oc --insecure-skip-tls-verify --request-timeout=5s get csr -o json
- queue /tmp/artifacts/endpoints.json oc --insecure-skip-tls-verify --request-timeout=5s get endpoints --all-namespaces -o json
- FILTER=gzip queue /tmp/artifacts/deployments.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get deployments --all-namespaces -o json
- FILTER=gzip queue /tmp/artifacts/daemonsets.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get daemonsets --all-namespaces -o json
- queue /tmp/artifacts/events.json oc --insecure-skip-tls-verify --request-timeout=5s get events --all-namespaces -o json
- queue /tmp/artifacts/kubeapiserver.json oc --insecure-skip-tls-verify --request-timeout=5s get kubeapiserver -o json
- queue /tmp/artifacts/kubecontrollermanager.json oc --insecure-skip-tls-verify --request-timeout=5s get kubecontrollermanager -o json
- queue /tmp/artifacts/machineconfigpools.json oc --insecure-skip-tls-verify --request-timeout=5s get machineconfigpools -o json
- queue /tmp/artifacts/machineconfigs.json oc --insecure-skip-tls-verify --request-timeout=5s get machineconfigs -o json
- queue /tmp/artifacts/machinesets.json oc --insecure-skip-tls-verify --request-timeout=5s get machinesets -A -o json
- queue /tmp/artifacts/machines.json oc --insecure-skip-tls-verify --request-timeout=5s get machines -A -o json
- queue /tmp/artifacts/namespaces.json oc --insecure-skip-tls-verify --request-timeout=5s get namespaces -o json
- queue /tmp/artifacts/nodes.json oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o json
- queue /tmp/artifacts/openshiftapiserver.json oc --insecure-skip-tls-verify --request-timeout=5s get openshiftapiserver -o json
- queue /tmp/artifacts/pods.json oc --insecure-skip-tls-verify --request-timeout=5s get pods --all-namespaces -o json
- queue /tmp/artifacts/persistentvolumes.json oc --insecure-skip-tls-verify --request-timeout=5s get persistentvolumes --all-namespaces -o json
- queue /tmp/artifacts/persistentvolumeclaims.json oc --insecure-skip-tls-verify --request-timeout=5s get persistentvolumeclaims --all-namespaces -o json
- FILTER=gzip queue /tmp/artifacts/replicasets.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get replicasets --all-namespaces -o json
- queue /tmp/artifacts/rolebindings.json oc --insecure-skip-tls-verify --request-timeout=5s get rolebindings --all-namespaces -o json
- queue /tmp/artifacts/roles.json oc --insecure-skip-tls-verify --request-timeout=5s get roles --all-namespaces -o json
- queue /tmp/artifacts/services.json oc --insecure-skip-tls-verify --request-timeout=5s get services --all-namespaces -o json
- FILTER=gzip queue /tmp/artifacts/statefulsets.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get statefulsets --all-namespaces -o json
-
- FILTER=gzip queue /tmp/artifacts/openapi.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get --raw /openapi/v2
+ queue ${ARTIFACT_DIR}/config-resources.json oc --insecure-skip-tls-verify --request-timeout=5s get apiserver.config.openshift.io authentication.config.openshift.io build.config.openshift.io console.config.openshift.io dns.config.openshift.io featuregate.config.openshift.io image.config.openshift.io infrastructure.config.openshift.io ingress.config.openshift.io network.config.openshift.io oauth.config.openshift.io project.config.openshift.io scheduler.config.openshift.io -o json
+ queue ${ARTIFACT_DIR}/apiservices.json oc --insecure-skip-tls-verify --request-timeout=5s get apiservices -o json
+ queue ${ARTIFACT_DIR}/clusteroperators.json oc --insecure-skip-tls-verify --request-timeout=5s get clusteroperators -o json
+ queue ${ARTIFACT_DIR}/clusterversion.json oc --insecure-skip-tls-verify --request-timeout=5s get clusterversion -o json
+ queue ${ARTIFACT_DIR}/configmaps.json oc --insecure-skip-tls-verify --request-timeout=5s get configmaps --all-namespaces -o json
+ queue ${ARTIFACT_DIR}/credentialsrequests.json oc --insecure-skip-tls-verify --request-timeout=5s get credentialsrequests --all-namespaces -o json
+ queue ${ARTIFACT_DIR}/csr.json oc --insecure-skip-tls-verify --request-timeout=5s get csr -o json
+ queue ${ARTIFACT_DIR}/endpoints.json oc --insecure-skip-tls-verify --request-timeout=5s get endpoints --all-namespaces -o json
+ FILTER=gzip queue ${ARTIFACT_DIR}/deployments.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get deployments --all-namespaces -o json
+ FILTER=gzip queue ${ARTIFACT_DIR}/daemonsets.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get daemonsets --all-namespaces -o json
+ queue ${ARTIFACT_DIR}/events.json oc --insecure-skip-tls-verify --request-timeout=5s get events --all-namespaces -o json
+ queue ${ARTIFACT_DIR}/kubeapiserver.json oc --insecure-skip-tls-verify --request-timeout=5s get kubeapiserver -o json
+ queue ${ARTIFACT_DIR}/kubecontrollermanager.json oc --insecure-skip-tls-verify --request-timeout=5s get kubecontrollermanager -o json
+ queue ${ARTIFACT_DIR}/machineconfigpools.json oc --insecure-skip-tls-verify --request-timeout=5s get machineconfigpools -o json
+ queue ${ARTIFACT_DIR}/machineconfigs.json oc --insecure-skip-tls-verify --request-timeout=5s get machineconfigs -o json
+ queue ${ARTIFACT_DIR}/machinesets.json oc --insecure-skip-tls-verify --request-timeout=5s get machinesets -A -o json
+ queue ${ARTIFACT_DIR}/machines.json oc --insecure-skip-tls-verify --request-timeout=5s get machines -A -o json
+ queue ${ARTIFACT_DIR}/namespaces.json oc --insecure-skip-tls-verify --request-timeout=5s get namespaces -o json
+ queue ${ARTIFACT_DIR}/nodes.json oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o json
+ queue ${ARTIFACT_DIR}/openshiftapiserver.json oc --insecure-skip-tls-verify --request-timeout=5s get openshiftapiserver -o json
+ queue ${ARTIFACT_DIR}/pods.json oc --insecure-skip-tls-verify --request-timeout=5s get pods --all-namespaces -o json
+ queue ${ARTIFACT_DIR}/persistentvolumes.json oc --insecure-skip-tls-verify --request-timeout=5s get persistentvolumes --all-namespaces -o json
+ queue ${ARTIFACT_DIR}/persistentvolumeclaims.json oc --insecure-skip-tls-verify --request-timeout=5s get persistentvolumeclaims --all-namespaces -o json
+ FILTER=gzip queue ${ARTIFACT_DIR}/replicasets.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get replicasets --all-namespaces -o json
+ queue ${ARTIFACT_DIR}/rolebindings.json oc --insecure-skip-tls-verify --request-timeout=5s get rolebindings --all-namespaces -o json
+ queue ${ARTIFACT_DIR}/roles.json oc --insecure-skip-tls-verify --request-timeout=5s get roles --all-namespaces -o json
+ queue ${ARTIFACT_DIR}/services.json oc --insecure-skip-tls-verify --request-timeout=5s get services --all-namespaces -o json
+ FILTER=gzip queue ${ARTIFACT_DIR}/statefulsets.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get statefulsets --all-namespaces -o json
+
+ FILTER=gzip queue ${ARTIFACT_DIR}/openapi.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get --raw /openapi/v2
# gather nodes first in parallel since they may contain the most relevant debugging info
while IFS= read -r i; do
- mkdir -p /tmp/artifacts/nodes/$i
- queue /tmp/artifacts/nodes/$i/heap oc --insecure-skip-tls-verify get --request-timeout=20s --raw /api/v1/nodes/$i/proxy/debug/pprof/heap
+ mkdir -p ${ARTIFACT_DIR}/nodes/$i
+ queue ${ARTIFACT_DIR}/nodes/$i/heap oc --insecure-skip-tls-verify get --request-timeout=20s --raw /api/v1/nodes/$i/proxy/debug/pprof/heap
+ FILTER=gzip queue ${ARTIFACT_DIR}/nodes/$i/journal.gz oc --insecure-skip-tls-verify adm node-logs $i --unify=false
+ FILTER=gzip queue ${ARTIFACT_DIR}/nodes/$i/journal-previous.gz oc --insecure-skip-tls-verify adm node-logs $i --unify=false --boot=-1
done < /tmp/nodes
- if [[ "${CLUSTER_TYPE}" = "aws" ]]; then
+ if [[ "${CLUSTER_TYPE}" == "aws" ]]; then
# FIXME: get epel-release or otherwise add awscli to our teardown image
export PATH="${HOME}/.local/bin:${PATH}"
- easy_install --user pip # our Python 2.7.5 is even too old for ensurepip
+ easy_install --user 'pip<21' # our Python 2.7.5 is even too old for ensurepip
pip install --user awscli
- export AWS_DEFAULT_REGION="$(python -c 'import json; data = json.load(open("/tmp/artifacts/installer/metadata.json")); print(data["aws"]["region"])')"
+ export AWS_DEFAULT_REGION="$(python -c 'import json, sys; data = json.load(open(sys.argv[1])); print(data["aws"]["region"])' "${ARTIFACT_DIR}/installer/metadata.json")" # metadata path is passed as argv[1]: the shell does not expand variables inside the single-quoted Python source
echo "gathering node console output from ${AWS_DEFAULT_REGION}"
fi
while IFS= read -r i; do
- mkdir -p "/tmp/artifacts/nodes/${i}"
- if [[ "${CLUSTER_TYPE}" = "aws" ]]; then
- queue /tmp/artifacts/nodes/$i/console aws ec2 get-console-output --instance-id "${i}" --output text
+ mkdir -p "${ARTIFACT_DIR}/nodes/${i}"
+ if [[ "${CLUSTER_TYPE}" == "aws" ]]; then
+ queue ${ARTIFACT_DIR}/nodes/$i/console aws ec2 get-console-output --instance-id "${i}" --output text
fi
done < <(sort /tmp/node-provider-IDs | uniq)
- FILTER=gzip queue /tmp/artifacts/nodes/masters-journal.gz oc --insecure-skip-tls-verify adm node-logs --role=master --unify=false
- FILTER=gzip queue /tmp/artifacts/nodes/workers-journal.gz oc --insecure-skip-tls-verify adm node-logs --role=worker --unify=false
-
# Snapshot iptables-save on each node for debugging possible kube-proxy issues
oc --insecure-skip-tls-verify get --request-timeout=20s -n openshift-sdn -l app=sdn pods --template '{{ range .items }}{{ .metadata.name }}{{ "\n" }}{{ end }}' > /tmp/sdn-pods
while IFS= read -r i; do
- queue /tmp/artifacts/network/iptables-save-$i oc --insecure-skip-tls-verify rsh --timeout=20 -n openshift-sdn -c sdn $i iptables-save -c
+ queue ${ARTIFACT_DIR}/network/iptables-save-$i oc --insecure-skip-tls-verify rsh --timeout=20 -n openshift-sdn -c sdn $i iptables-save -c
done < /tmp/sdn-pods
while IFS= read -r i; do
file="$( echo "$i" | cut -d ' ' -f 3 | tr -s ' ' '_' )"
- queue /tmp/artifacts/metrics/${file}-heap oc --insecure-skip-tls-verify exec $i -- /bin/bash -c 'oc --insecure-skip-tls-verify get --raw /debug/pprof/heap --server "https://$( hostname ):8443" --config /etc/origin/master/admin.kubeconfig'
- queue /tmp/artifacts/metrics/${file}-controllers-heap oc --insecure-skip-tls-verify exec $i -- /bin/bash -c 'oc --insecure-skip-tls-verify get --raw /debug/pprof/heap --server "https://$( hostname ):8444" --config /etc/origin/master/admin.kubeconfig'
+ queue ${ARTIFACT_DIR}/metrics/${file}-heap oc --insecure-skip-tls-verify exec $i -- /bin/bash -c 'oc --insecure-skip-tls-verify get --raw /debug/pprof/heap --server "https://$( hostname ):8443" --config /etc/origin/master/admin.kubeconfig'
+ queue ${ARTIFACT_DIR}/metrics/${file}-controllers-heap oc --insecure-skip-tls-verify exec $i -- /bin/bash -c 'oc --insecure-skip-tls-verify get --raw /debug/pprof/heap --server "https://$( hostname ):8444" --config /etc/origin/master/admin.kubeconfig'
done < /tmp/pods-api
while IFS= read -r i; do
file="$( echo "$i" | cut -d ' ' -f 2,3,5 | tr -s ' ' '_' )"
- FILTER=gzip queue /tmp/artifacts/pods/${file}.log.gz oc --insecure-skip-tls-verify logs --request-timeout=20s $i
- FILTER=gzip queue /tmp/artifacts/pods/${file}_previous.log.gz oc --insecure-skip-tls-verify logs --request-timeout=20s -p $i
+ FILTER=gzip queue ${ARTIFACT_DIR}/pods/${file}.log.gz oc --insecure-skip-tls-verify logs --request-timeout=20s $i
+ FILTER=gzip queue ${ARTIFACT_DIR}/pods/${file}_previous.log.gz oc --insecure-skip-tls-verify logs --request-timeout=20s -p $i
done < /tmp/containers
echo "Snapshotting prometheus (may take 15s) ..."
- queue /tmp/artifacts/metrics/prometheus.tar.gz oc --insecure-skip-tls-verify exec -n openshift-monitoring prometheus-k8s-0 -- tar cvzf - -C /prometheus .
- FILTER=gzip queue /tmp/artifacts/metrics/prometheus-target-metadata.json.gz oc --insecure-skip-tls-verify exec -n openshift-monitoring prometheus-k8s-0 -- /bin/bash -c "curl -G http://localhost:9090/api/v1/targets/metadata --data-urlencode 'match_target={instance!=\"\"}'"
+ queue ${ARTIFACT_DIR}/metrics/prometheus.tar.gz oc --insecure-skip-tls-verify exec -n openshift-monitoring prometheus-k8s-0 -- tar cvzf - -C /prometheus .
+ FILTER=gzip queue ${ARTIFACT_DIR}/metrics/prometheus-target-metadata.json.gz oc --insecure-skip-tls-verify exec -n openshift-monitoring prometheus-k8s-0 -- /bin/bash -c "curl -G http://localhost:9090/api/v1/targets/metadata --data-urlencode 'match_target={instance!=\"\"}'"
echo "Running must-gather..."
- mkdir -p /tmp/artifacts/must-gather
- queue /tmp/artifacts/must-gather/must-gather.log oc --insecure-skip-tls-verify adm must-gather --dest-dir /tmp/artifacts/must-gather
+ mkdir -p ${ARTIFACT_DIR}/must-gather
+ queue ${ARTIFACT_DIR}/must-gather/must-gather.log oc --insecure-skip-tls-verify adm must-gather --dest-dir ${ARTIFACT_DIR}/must-gather
echo "Gathering audit logs..."
- mkdir -p /tmp/artifacts/audit-logs
- queue /tmp/artifacts/audit-logs/must-gather.log oc --insecure-skip-tls-verify adm must-gather --dest-dir /tmp/artifacts/audit-logs -- /usr/bin/gather_audit_logs
+ mkdir -p ${ARTIFACT_DIR}/audit-logs
+ queue ${ARTIFACT_DIR}/audit-logs/must-gather.log oc --insecure-skip-tls-verify adm must-gather --dest-dir ${ARTIFACT_DIR}/audit-logs -- /usr/bin/gather_audit_logs
echo "Waiting for logs ..."
wait
+ # This is a temporary conversion of the cluster operator status JSON to JUnit XML, matching the upgrade - may be moved to code in the future
+ mkdir -p ${ARTIFACT_DIR}/junit
+ curl -sL https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 >/tmp/jq && chmod ug+x /tmp/jq
+ <${ARTIFACT_DIR}/clusteroperators.json /tmp/jq -r 'def one(condition; t): t as $t | first([.[] | select(condition)] | map(.type=t)[]) // null; def msg: "Operator \(.type) (\(.reason)): \(.message)"; def xmlfailure: if .failure then "\(.failure | @html)" else "" end; def xmltest: "\( xmlfailure )"; def withconditions: map({name: "operator conditions \(.metadata.name)"} + ((.status.conditions // [{type:"Available",status: "False",message:"operator is not reporting conditions"}]) | (one(.type=="Available" and .status!="True"; "unavailable") // one(.type=="Degraded" and .status=="True"; "degraded") // one(.type=="Progressing" and .status=="True"; "progressing") // null) | if . then {failure: .|msg} else null end)); .items | withconditions | "\n\( [.[] | xmltest] | join("\n"))\n"' >${ARTIFACT_DIR}/junit/junit_install_status.xml
+
# This is an experimental wiring of autogenerated failure detection.
echo "Detect known failures from symptoms (experimental) ..."
- curl -f https://gist.githubusercontent.com/smarterclayton/03b50c8f9b6351b2d9903d7fb35b342f/raw/symptom.sh 2>/dev/null | bash -s /tmp/artifacts > /tmp/artifacts/junit/junit_symptoms.xml
+ curl -f https://gist.githubusercontent.com/smarterclayton/03b50c8f9b6351b2d9903d7fb35b342f/raw/symptom.sh 2>/dev/null | bash -s ${ARTIFACT_DIR} > ${ARTIFACT_DIR}/junit/junit_symptoms.xml
for artifact in must-gather audit-logs ; do
- tar -czC /tmp/artifacts/${artifact} -f /tmp/artifacts/${artifact}.tar.gz . &&
- rm -rf /tmp/artifacts/${artifact}
+ tar -czC ${ARTIFACT_DIR}/${artifact} -f ${ARTIFACT_DIR}/${artifact}.tar.gz . &&
+ rm -rf ${ARTIFACT_DIR}/${artifact}
done
echo "Deprovisioning cluster ..."
- openshift-install --dir /tmp/artifacts/installer destroy cluster
+ openshift-install --dir ${ARTIFACT_DIR}/installer destroy cluster
}
trap 'teardown' EXIT