diff --git a/ci-operator/config/openshift/installer/openshift-installer-master.yaml b/ci-operator/config/openshift/installer/openshift-installer-master.yaml
index 7df26855c056b..47321eb65c4d2 100644
--- a/ci-operator/config/openshift/installer/openshift-installer-master.yaml
+++ b/ci-operator/config/openshift/installer/openshift-installer-master.yaml
@@ -100,3 +100,7 @@ tests:
   commands: TEST_SUITE=openshift/conformance run-tests
   openshift_ansible_40:
     cluster_profile: aws-centos-40
+- as: e2e-vsphere
+  commands: TEST_SUITE=openshift/conformance/parallel run-tests
+  openshift_installer_upi:
+    cluster_profile: vsphere
diff --git a/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml b/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml
index be994c38d696f..006205114c0ed 100644
--- a/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml
+++ b/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml
@@ -386,6 +386,64 @@ presubmits:
           name: prow-job-cluster-scaleup-e2e-40
         name: job-definition
     trigger: '(?m)^/test (?:.*? )?e2e-rhel-scaleup(?: .*?)?$'
+  - agent: kubernetes
+    always_run: false
+    branches:
+    - master
+    context: ci/prow/e2e-vsphere
+    decorate: true
+    decoration_config:
+      skip_cloning: true
+    labels:
+      ci-operator.openshift.io/prowgen-controlled: "true"
+    name: pull-ci-openshift-installer-master-e2e-vsphere
+    optional: true
+    rerun_command: /test e2e-vsphere
+    spec:
+      containers:
+      - args:
+        - --artifact-dir=$(ARTIFACTS)
+        - --give-pr-author-access-to-namespace=true
+        - --secret-dir=/usr/local/e2e-vsphere-cluster-profile
+        - --target=e2e-vsphere
+        - --template=/usr/local/e2e-vsphere
+        command:
+        - ci-operator
+        env:
+        - name: CLUSTER_TYPE
+          value: vsphere
+        - name: CONFIG_SPEC
+          valueFrom:
+            configMapKeyRef:
+              key: openshift-installer-master.yaml
+              name: ci-operator-master-configs
+        - name: JOB_NAME_SAFE
+          value: e2e-vsphere
+        - name: TEST_COMMAND
+          value: TEST_SUITE=openshift/conformance/parallel run-tests
+        image: ci-operator:latest
+        imagePullPolicy: Always
+        name: ""
+        resources:
+          requests:
+            cpu: 10m
+        volumeMounts:
+        - mountPath: /usr/local/e2e-vsphere-cluster-profile
+          name: cluster-profile
+        - mountPath: /usr/local/e2e-vsphere
+          name: job-definition
+          subPath: cluster-launch-installer-upi-e2e.yaml
+      serviceAccountName: ci-operator
+      volumes:
+      - name: cluster-profile
+        projected:
+          sources:
+          - secret:
+              name: cluster-secrets-vsphere
+      - configMap:
+          name: prow-job-cluster-launch-installer-upi-e2e
+        name: job-definition
+    trigger: '(?m)^/test (?:.*? )?e2e-vsphere(?: .*?)?$'
   - agent: kubernetes
     always_run: true
     branches:
diff --git a/ci-operator/templates/openshift/installer/cluster-launch-installer-upi-e2e.yaml b/ci-operator/templates/openshift/installer/cluster-launch-installer-upi-e2e.yaml
new file mode 100644
index 0000000000000..e84e80151c811
--- /dev/null
+++ b/ci-operator/templates/openshift/installer/cluster-launch-installer-upi-e2e.yaml
@@ -0,0 +1,638 @@
+kind: Template
+apiVersion: template.openshift.io/v1
+
+parameters:
+- name: JOB_NAME_SAFE
+  required: true
+- name: JOB_NAME_HASH
+  required: true
+- name: NAMESPACE
+  required: true
+- name: IMAGE_TESTS
+  required: true
+- name: LOCAL_IMAGE_UPI_INSTALLER
+  required: true
+- name: CLUSTER_TYPE
+  required: true
+- name: TEST_COMMAND
+  required: true
+- name: RELEASE_IMAGE_LATEST
+  required: true
+- name: BASE_DOMAIN
+  value: origin-ci-int-aws.dev.rhcloud.com
+  required: true
+
+objects:
+
+# We want the cluster to be able to access these images
+- kind: RoleBinding
+  apiVersion: authorization.openshift.io/v1
+  metadata:
+    name: ${JOB_NAME_SAFE}-image-puller
+    namespace: ${NAMESPACE}
+  roleRef:
+    name: system:image-puller
+  subjects:
+  - kind: SystemGroup
+    name: system:unauthenticated
+  - kind: SystemGroup
+    name: system:authenticated
+
+# Give edit access to a known bot
+- kind: RoleBinding
+  apiVersion: authorization.openshift.io/v1
+  metadata:
+    name: ${JOB_NAME_SAFE}-namespace-editors
+    namespace: ${NAMESPACE}
+  roleRef:
+    name: edit
+  subjects:
+  - kind: ServiceAccount
+    namespace: ci
+    name: ci-chat-bot
+
+# Route for bootstrap ignition file
+- apiVersion: route.openshift.io/v1
+  kind: Route
+  metadata:
+    name: ${JOB_NAME_SAFE}-bootstrap-exporter
+    namespace: ${NAMESPACE}
+  spec:
+    host: ${JOB_NAME_SAFE}-bootstrap-exporter-${NAMESPACE}.svc.ci.openshift.org
+    to:
+      name: ${JOB_NAME_SAFE}-bootstrap-exporter
+    tls:
+      termination: Edge
+      insecureEdgeTerminationPolicy: Redirect
+
+# Service for ignition file
+- apiVersion: v1
+  kind: Service
+  metadata:
+    name: ${JOB_NAME_SAFE}-bootstrap-exporter
+    namespace: ${NAMESPACE}
+  spec:
+    selector:
+      app: testpod
+    ports:
+    - port: 80
+      targetPort: 8080
+
+# The e2e pod spins up a cluster, runs e2e tests, and then cleans up the cluster.
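+# It runs four containers: "ignition-exporter" serves the generated bootstrap.ign from the
+# shared volume through the Service/Route above, "setup" renders install-config.yaml and
+# prepares the UPI Terraform configuration, "test" waits for the API and runs ${TEST_COMMAND},
+# and "teardown" gathers artifacts and destroys the Terraform-managed resources.
+# Note (illustrative sketch, not generated by this template): a UPI bootstrap VM would
+# typically boot with a small pointer Ignition config that appends the served file, e.g.
+#   {"ignition":{"version":"2.2.0","config":{"append":[{"source":"http://<route-host>/bootstrap.ign"}]}}}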
+- kind: Pod
+  apiVersion: v1
+  metadata:
+    name: ${JOB_NAME_SAFE}
+    namespace: ${NAMESPACE}
+    annotations:
+      # we want to gather the teardown logs no matter what
+      ci-operator.openshift.io/wait-for-container-artifacts: teardown
+      ci-operator.openshift.io/save-container-logs: "true"
+      ci-operator.openshift.io/container-sub-tests: "setup,test,teardown"
+    labels:
+      app: testpod
+  spec:
+    restartPolicy: Never
+    activeDeadlineSeconds: 14400
+    terminationGracePeriodSeconds: 900
+    volumes:
+    - name: shared-ignition-files
+      emptyDir: {}
+    - name: artifacts
+      emptyDir: {}
+    - name: shared-tmp
+      emptyDir: {}
+    - name: cluster-profile
+      secret:
+        secretName: ${JOB_NAME_SAFE}-cluster-profile
+
+    containers:
+
+    - name: ignition-exporter
+      image: registry.svc.ci.openshift.org/openshift/origin-v4.0:artifacts
+      volumeMounts:
+      - name: shared-ignition-files
+        mountPath: /srv
+      - name: shared-tmp
+        mountPath: /tmp/shared
+      workingDir: /srv
+      command:
+      - /bin/bash
+      - -c
+      args:
+      - |
+        #!/bin/bash
+        set -euo pipefail
+        cat <<'END' >>/tmp/serve.py
+        import os, SocketServer, SimpleHTTPServer
+
+        addr = ('', 8080)
+        httpd = SocketServer.TCPServer(addr, SimpleHTTPServer.SimpleHTTPRequestHandler)
+        while not os.path.isfile("/tmp/shared/exit"):
+            httpd.handle_request()
+        END
+        python /tmp/serve.py
+      ports:
+      - containerPort: 8080
+        protocol: TCP
+      livenessProbe:
+        httpGet:
+          path: /
+          port: 8080
+          scheme: HTTP
+      readinessProbe:
+        failureThreshold: 3
+        httpGet:
+          path: /
+          port: 8080
+          scheme: HTTP
+      resources:
+        requests:
+          cpu: 50m
+          memory: 50Mi
+
+    # Once the cluster is up, executes shared tests
+    - name: test
+      image: ${IMAGE_TESTS}
+      terminationMessagePolicy: FallbackToLogsOnError
+      resources:
+        requests:
+          cpu: 1
+          memory: 300Mi
+        limits:
+          memory: 3Gi
+      volumeMounts:
+      - name: shared-tmp
+        mountPath: /tmp/shared
+      - name: cluster-profile
+        mountPath: /tmp/cluster
+      - name: artifacts
+        mountPath: /tmp/artifacts
+      env:
+      - name: AWS_SHARED_CREDENTIALS_FILE
+        value: /tmp/cluster/.awscred
+      - name: AWS_DEFAULT_REGION
+        value: us-east-1
+      - name: ARTIFACT_DIR
+        value: /tmp/artifacts
+      - name: HOME
+        value: /tmp/home
+      - name: KUBECONFIG
+        value: /tmp/artifacts/installer/auth/kubeconfig
+      command:
+      - /bin/bash
+      - -c
+      - |
+        #!/bin/bash
+        set -euo pipefail
+
+        export PATH=/usr/libexec/origin:$PATH
+
+        trap 'touch /tmp/shared/exit' EXIT
+        trap 'kill $(jobs -p); exit 0' TERM
+
+        mkdir -p "${HOME}"
+
+        # wait for the API to come up
+        while true; do
+          if [[ -f /tmp/shared/exit ]]; then
+            echo "Another process exited" 2>&1
+            exit 1
+          fi
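+          # poll until the setup container has written /tmp/shared/setup-success, then
+          # switch to a private copy of the admin kubeconfig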
+          if [[ ! -f /tmp/shared/setup-success ]]; then
+            sleep 15 & wait
+            continue
+          fi
+          # don't let clients impact the global kubeconfig
+          cp "${KUBECONFIG}" /tmp/admin.kubeconfig
+          export KUBECONFIG=/tmp/admin.kubeconfig
+          break
+        done
+
+        # set up cloud-provider-specific env vars
+        export KUBE_SSH_BASTION="$( oc --insecure-skip-tls-verify get node -l node-role.kubernetes.io/master -o 'jsonpath={.items[0].status.addresses[?(@.type=="ExternalIP")].address}' ):22"
+        export KUBE_SSH_KEY_PATH=/tmp/cluster/ssh-privatekey
+        if [[ "${CLUSTER_TYPE}" == "gcp" ]]; then
+          export GOOGLE_APPLICATION_CREDENTIALS="/tmp/cluster/gce.json"
+          export KUBE_SSH_USER=cloud-user
+          mkdir -p ~/.ssh
+          cp /tmp/cluster/ssh-privatekey ~/.ssh/google_compute_engine || true
+          export PROVIDER_ARGS='-provider=gce -gce-zone=us-east1-c -gce-project=openshift-gce-devel-ci'
+          export TEST_PROVIDER='{"type":"gce","zone":"us-east1-c","projectid":"openshift-gce-devel-ci"}'
+        elif [[ "${CLUSTER_TYPE}" == "aws" ]]; then
+          mkdir -p ~/.ssh
+          cp /tmp/cluster/ssh-privatekey ~/.ssh/kube_aws_rsa || true
+          export PROVIDER_ARGS="-provider=aws -gce-zone=us-east-1"
+          # TODO: make openshift-tests auto-discover this from cluster config
+          export TEST_PROVIDER='{"type":"aws","region":"us-east-1","zone":"us-east-1a","multizone":true,"multimaster":true}'
+          export KUBE_SSH_USER=core
+        fi
+
+        mkdir -p /tmp/output
+        cd /tmp/output
+
+        function run-upgrade-tests() {
+          openshift-tests run-upgrade "${TEST_SUITE}" --to-image "${RELEASE_IMAGE_LATEST}" \
+            --provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit
+          exit 0
+        }
+
+        function run-tests() {
+          openshift-tests run "${TEST_SUITE}" \
+            --provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit
+          exit 0
+        }
+
+        ${TEST_COMMAND}
+
+    # Runs an install
+    - name: setup
+      image: ${LOCAL_IMAGE_UPI_INSTALLER}
+      terminationMessagePolicy: FallbackToLogsOnError
+      volumeMounts:
+      - name: shared-tmp
+        mountPath: /tmp
+      - name: shared-ignition-files
+        mountPath: /srv
+      - name: cluster-profile
+        mountPath: /etc/openshift-installer
+      - name: artifacts
+        mountPath: /tmp/artifacts
+      env:
+      - name: TYPE
+        value: ${CLUSTER_TYPE}
+      - name: AWS_SHARED_CREDENTIALS_FILE
+        value: /etc/openshift-installer/.awscred
+      - name: AWS_REGION
+        value: us-east-1
+      - name: CLUSTER_NAME
+        value: ${NAMESPACE}-${JOB_NAME_HASH}
+      - name: BASE_DOMAIN
+        value: ${BASE_DOMAIN}
+      - name: SSH_PUB_KEY_PATH
+        value: /etc/openshift-installer/ssh-publickey
+      - name: PULL_SECRET_PATH
+        value: /etc/openshift-installer/pull-secret
+      - name: TFVARS_PATH
+        value: /etc/openshift-installer/secret.auto.tfvars
+      - name: OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE
+        value: ${RELEASE_IMAGE_LATEST}
+      - name: USER
+        value: test
+      - name: HOME
+        value: /tmp
+      - name: INSTALL_INITIAL_RELEASE
+      - name: RELEASE_IMAGE_INITIAL
+      command:
+      - /bin/sh
+      - -c
+      - |
+        #!/bin/sh
+        set -e
+        trap 'rc=$?; if test "${rc}" -eq 0; then touch /tmp/setup-success; else touch /tmp/exit; fi; exit "${rc}"' EXIT
+        trap 'CHILDREN=$(jobs -p); if test -n "${CHILDREN}"; then kill ${CHILDREN} && wait; fi' TERM
+
+        cp "$(command -v openshift-install)" /tmp
+        mkdir /tmp/artifacts/installer
+
+        if [[ -n "${INSTALL_INITIAL_RELEASE}" && -n "${RELEASE_IMAGE_INITIAL}" ]]; then
+          echo "Installing from initial release ${RELEASE_IMAGE_INITIAL}"
+          OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE="${RELEASE_IMAGE_INITIAL}"
+        else
+          echo "Installing from release ${RELEASE_IMAGE_LATEST}"
+        fi
+
+        export EXPIRATION_DATE=$(date -d '4 hours' --iso=minutes --utc)
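+        # The SSH public key and pull secret from the cluster profile are inlined into the
+        # install-config.yaml rendered below; EXPIRATION_DATE is only consumed by the aws
+        # userTags. For vsphere, the secret.auto.tfvars mounted at TFVARS_PATH is assumed to
+        # contain lines of the form (inferred from the grep patterns below):
+        #   vsphere_user="<vcenter username>"
+        #   vsphere_password="<vcenter password>"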
+        export SSH_PUB_KEY=$(cat "${SSH_PUB_KEY_PATH}")
+        export PULL_SECRET=$(cat "${PULL_SECRET_PATH}")
+
+        if [[ "${CLUSTER_TYPE}" == "aws" ]]; then
+          cat > /tmp/artifacts/installer/install-config.yaml << EOF
+        apiVersion: v1beta4
+        baseDomain: ${BASE_DOMAIN}
+        metadata:
+          name: ${CLUSTER_NAME}
+        networking:
+          clusterNetwork:
+          - cidr: 10.128.0.0/14
+            hostPrefix: 23
+          machineCIDR: 10.0.0.0/16
+          serviceNetwork:
+          - 172.30.0.0/16
+          networkType: OpenShiftSDN
+        platform:
+          aws:
+            region: ${AWS_REGION}
+            userTags:
+              expirationDate: ${EXPIRATION_DATE}
+        pullSecret: >
+          ${PULL_SECRET}
+        sshKey: |
+          ${SSH_PUB_KEY}
+        EOF
+        elif [[ "${CLUSTER_TYPE}" == "vsphere" ]]; then
+          # Get user and password from TFVARS_PATH
+          export VSPHERE_USER=$(grep -oP 'vsphere_user="\K[^"]+' ${TFVARS_PATH})
+          export VSPHERE_PASSWORD=$(grep -oP 'vsphere_password="\K[^"]+' ${TFVARS_PATH})
+          cat > /tmp/artifacts/installer/install-config.yaml << EOF
+        apiVersion: v1beta4
+        baseDomain: ${BASE_DOMAIN}
+        metadata:
+          name: ${CLUSTER_NAME}
+        networking:
+          machineCIDR: 139.178.73.0/26
+        platform:
+          vsphere:
+            vCenter: vcsa-ci.vmware.devcluster.openshift.com
+            username: "${VSPHERE_USER}"
+            password: "${VSPHERE_PASSWORD}"
+            datacenter: dc1
+            defaultDatastore: nvme-ds1
+        pullSecret: >
+          ${PULL_SECRET}
+        sshKey: |
+          ${SSH_PUB_KEY}
+        EOF
+        else
+          echo "Unsupported cluster type '${CLUSTER_TYPE}'"
+          exit 1
+        fi
+
+        echo "Creating ignition configs"
+        openshift-install --dir=/tmp/artifacts/installer create ignition-configs &
+        wait "$!"
+
+        cp /tmp/artifacts/installer/bootstrap.ign /srv
+
+        mkdir -p /tmp/tf
+
+        # Copy sample UPI files
+        cp -r /var/lib/openshift-install/upi/${CLUSTER_TYPE}/* /tmp/tf
+
+        # Create terraform.tfvars
+        export BOOTSTRAP_URL="http://${JOB_NAME_SAFE}-bootstrap-exporter-${NAMESPACE}.svc.ci.openshift.org/bootstrap.ign"
+        export MASTER_IGN=$(cat /tmp/artifacts/installer/master.ign)
+        export WORKER_IGN=$(cat /tmp/artifacts/installer/worker.ign)
+
+        cat > /tmp/tf/terraform.tfvars <<-EOF
+        machine_cidr = "139.178.73.0/26"
+
+        vm_template = "rhcos-latest"
+
+        vsphere_cluster = "devel"
+
+        vsphere_datacenter = "dc1"
+
+        vsphere_datastore = "nvme-ds1"
+
+        vsphere_server = "vcsa-ci.vmware.devcluster.openshift.com"
+
+        ipam = "139.178.89.254"
+
+        cluster_id = "${CLUSTER_NAME}"
+
+        base_domain = "${BASE_DOMAIN}"
+
+        cluster_domain = "${CLUSTER_NAME}.${BASE_DOMAIN}"
+
+        bootstrap_ignition_url = "${BOOTSTRAP_URL}"
+
+        // Ignition config for the control plane machines. You should copy the contents of the master.ign generated by the installer.
+        control_plane_ignition = <"${TARGET}" &
+          else
+            "${@}" >"${TARGET}" &
+          fi
+        }
+
+        function teardown() {
+          set +e
+          touch /tmp/shared/exit
+          export PATH=$PATH:/tmp/shared
+
+          echo "Gathering artifacts ..."
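+          # Each `queue <target> <command...>` below runs the command in the background and
+          # writes its output to <target>, piping through ${FILTER} (gzip) when it is set;
+          # the `wait` near the end of teardown collects all of these jobs.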
+          mkdir -p /tmp/artifacts/pods /tmp/artifacts/nodes /tmp/artifacts/metrics /tmp/artifacts/bootstrap /tmp/artifacts/network
+
+          if [ -f /tmp/artifacts/installer/.openshift_install_state.json ]
+          then
+            # Remove VSPHERE_USER and VSPHERE_PASSWORD from install state json
+            export VSPHERE_USER=$(grep -oP 'vsphere_user="\K[^"]+' ${TFVARS_PATH})
+            export VSPHERE_PASSWORD=$(grep -oP 'vsphere_password="\K[^"]+' ${TFVARS_PATH})
+            sed -i "s;${VSPHERE_USER};REDACTED;g" /tmp/artifacts/installer/.openshift_install_state.json
+            sed -i "s;${VSPHERE_PASSWORD};REDACTED;g" /tmp/artifacts/installer/.openshift_install_state.json
+
+            # bootstrap.ign also contains passwords
+            rm -rf /tmp/artifacts/installer/bootstrap.ign
+            jq -r '."*bootstrap.Bootstrap" |= {"Config": "REDACTED"}' /tmp/artifacts/installer/.openshift_install_state.json > /tmp/artifacts/installer/openshift_install_state_updated.json
+            mv /tmp/artifacts/installer/openshift_install_state_updated.json /tmp/artifacts/installer/.openshift_install_state.json
+          fi
+
+          if [ -f /tmp/artifacts/installer/terraform.tfstate ]
+          then
+            # we don't have jq, so the python equivalent of
+            # jq '.modules[].resources."aws_instance.bootstrap".primary.attributes."public_ip" | select(.)'
+            bootstrap_ip=$(python -c \
+              'import sys, json; d=reduce(lambda x,y: dict(x.items() + y.items()), map(lambda x: x["resources"], json.load(sys.stdin)["modules"])); k="aws_instance.bootstrap"; print d[k]["primary"]["attributes"]["public_ip"] if k in d else ""' \
+              < /tmp/artifacts/installer/terraform.tfstate
+            )
+
+            if [ -n "${bootstrap_ip}" ]
+            then
+              for service in bootkube openshift kubelet crio
+              do
+                queue "/tmp/artifacts/bootstrap/${service}.service" curl \
+                  --insecure \
+                  --silent \
+                  --connect-timeout 5 \
+                  --retry 3 \
+                  --cert /tmp/artifacts/installer/tls/journal-gatewayd.crt \
+                  --key /tmp/artifacts/installer/tls/journal-gatewayd.key \
+                  --url "https://${bootstrap_ip}:19531/entries?_SYSTEMD_UNIT=${service}.service"
+              done
+            fi
+          else
+            echo "No terraform statefile found. Skipping collection of bootstrap logs."
+          fi
+
+          oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o jsonpath --template '{range .items[*]}{.metadata.name}{"\n"}{end}' > /tmp/nodes
+          oc --insecure-skip-tls-verify --request-timeout=5s get pods --all-namespaces --template '{{ range .items }}{{ $name := .metadata.name }}{{ $ns := .metadata.namespace }}{{ range .spec.containers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ range .spec.initContainers }}-n {{ $ns }} {{ $name }} -c {{ .name }}{{ "\n" }}{{ end }}{{ end }}' > /tmp/containers
+          oc --insecure-skip-tls-verify --request-timeout=5s get pods -l openshift.io/component=api --all-namespaces --template '{{ range .items }}-n {{ .metadata.namespace }} {{ .metadata.name }}{{ "\n" }}{{ end }}' > /tmp/pods-api
+
+          queue /tmp/artifacts/config-resources.json oc --insecure-skip-tls-verify --request-timeout=5s get apiserver.config.openshift.io authentication.config.openshift.io build.config.openshift.io console.config.openshift.io dns.config.openshift.io featuregate.config.openshift.io image.config.openshift.io infrastructure.config.openshift.io ingress.config.openshift.io network.config.openshift.io oauth.config.openshift.io project.config.openshift.io scheduler.config.openshift.io -o json
+          queue /tmp/artifacts/apiservices.json oc --insecure-skip-tls-verify --request-timeout=5s get apiservices -o json
+          queue /tmp/artifacts/apiservices.json oc --insecure-skip-tls-verify --request-timeout=5s get apiservices -o json
+          queue /tmp/artifacts/clusteroperators.json oc --insecure-skip-tls-verify --request-timeout=5s get clusteroperators -o json
+          queue /tmp/artifacts/clusterversion.json oc --insecure-skip-tls-verify --request-timeout=5s get clusterversion -o json
+          queue /tmp/artifacts/configmaps.json oc --insecure-skip-tls-verify --request-timeout=5s get configmaps --all-namespaces -o json
+          queue /tmp/artifacts/csr.json oc --insecure-skip-tls-verify --request-timeout=5s get csr -o json
+          queue /tmp/artifacts/endpoints.json oc --insecure-skip-tls-verify --request-timeout=5s get endpoints --all-namespaces -o json
+          queue /tmp/artifacts/events.json oc --insecure-skip-tls-verify --request-timeout=5s get events --all-namespaces -o json
+          queue /tmp/artifacts/kubeapiserver.json oc --insecure-skip-tls-verify --request-timeout=5s get kubeapiserver -o json
+          queue /tmp/artifacts/kubecontrollermanager.json oc --insecure-skip-tls-verify --request-timeout=5s get kubecontrollermanager -o json
+          queue /tmp/artifacts/machineconfigpools.json oc --insecure-skip-tls-verify --request-timeout=5s get machineconfigpools -o json
+          queue /tmp/artifacts/machineconfigs.json oc --insecure-skip-tls-verify --request-timeout=5s get machineconfigs -o json
+          queue /tmp/artifacts/namespaces.json oc --insecure-skip-tls-verify --request-timeout=5s get namespaces -o json
+          queue /tmp/artifacts/nodes.json oc --insecure-skip-tls-verify --request-timeout=5s get nodes -o json
+          queue /tmp/artifacts/openshiftapiserver.json oc --insecure-skip-tls-verify --request-timeout=5s get openshiftapiserver -o json
+          queue /tmp/artifacts/pods.json oc --insecure-skip-tls-verify --request-timeout=5s get pods --all-namespaces -o json
+          queue /tmp/artifacts/rolebindings.json oc --insecure-skip-tls-verify --request-timeout=5s get rolebindings --all-namespaces -o json
+          queue /tmp/artifacts/roles.json oc --insecure-skip-tls-verify --request-timeout=5s get roles --all-namespaces -o json
+          queue /tmp/artifacts/services.json oc --insecure-skip-tls-verify --request-timeout=5s get services --all-namespaces -o json
+
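+          # Larger payloads set FILTER=gzip so the queue helper compresses them before they
+          # land in the artifacts directory.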
+          FILTER=gzip queue /tmp/artifacts/openapi.json.gz oc --insecure-skip-tls-verify --request-timeout=5s get --raw /openapi/v2
+
+          # gather nodes first in parallel since they may contain the most relevant debugging info
+          while IFS= read -r i; do
+            mkdir -p /tmp/artifacts/nodes/$i
+            queue /tmp/artifacts/nodes/$i/heap oc --insecure-skip-tls-verify get --request-timeout=20s --raw /api/v1/nodes/$i/proxy/debug/pprof/heap
+          done < /tmp/nodes
+
+          if oc --insecure-skip-tls-verify adm node-logs -h &>/dev/null; then
+            # starting in 4.0 we can query node logs directly
+            FILTER=gzip queue /tmp/artifacts/nodes/masters-journal.gz oc --insecure-skip-tls-verify adm node-logs --role=master --unify=false
+            FILTER=gzip queue /tmp/artifacts/nodes/workers-journal.gz oc --insecure-skip-tls-verify adm node-logs --role=worker --unify=false
+          else
+            while IFS= read -r i; do
+              FILTER=gzip queue /tmp/artifacts/nodes/$i/messages.gz oc --insecure-skip-tls-verify get --request-timeout=20s --raw /api/v1/nodes/$i/proxy/logs/messages
+              oc --insecure-skip-tls-verify get --request-timeout=20s --raw /api/v1/nodes/$i/proxy/logs/journal | sed -e 's|.*href="\(.*\)".*|\1|;t;d' > /tmp/journals
+              while IFS= read -r j; do
+                FILTER=gzip queue /tmp/artifacts/nodes/$i/journal.gz oc --insecure-skip-tls-verify get --request-timeout=20s --raw /api/v1/nodes/$i/proxy/logs/journal/${j}system.journal
+              done < /tmp/journals
+              FILTER=gzip queue /tmp/artifacts/nodes/$i/secure.gz oc --insecure-skip-tls-verify get --request-timeout=20s --raw /api/v1/nodes/$i/proxy/logs/secure
+              FILTER=gzip queue /tmp/artifacts/nodes/$i/audit.gz oc --insecure-skip-tls-verify get --request-timeout=20s --raw /api/v1/nodes/$i/proxy/logs/audit
+            done < /tmp/nodes
+          fi
+
+          # Snapshot iptables-save on each node for debugging possible kube-proxy issues
+          oc --insecure-skip-tls-verify get --request-timeout=20s -n openshift-sdn -l app=sdn pods --template '{{ range .items }}{{ .metadata.name }}{{ "\n" }}{{ end }}' > /tmp/sdn-pods
+          while IFS= read -r i; do
+            queue /tmp/artifacts/network/iptables-save-$i oc --insecure-skip-tls-verify rsh --timeout=20 -n openshift-sdn -c sdn $i iptables-save -c
+          done < /tmp/sdn-pods
+
+          while IFS= read -r i; do
+            file="$( echo "$i" | cut -d ' ' -f 3 | tr -s ' ' '_' )"
+            queue /tmp/artifacts/metrics/${file}-heap oc --insecure-skip-tls-verify exec $i -- /bin/bash -c 'oc --insecure-skip-tls-verify get --raw /debug/pprof/heap --server "https://$( hostname ):8443" --config /etc/origin/master/admin.kubeconfig'
+            queue /tmp/artifacts/metrics/${file}-controllers-heap oc --insecure-skip-tls-verify exec $i -- /bin/bash -c 'oc --insecure-skip-tls-verify get --raw /debug/pprof/heap --server "https://$( hostname ):8444" --config /etc/origin/master/admin.kubeconfig'
+          done < /tmp/pods-api
+
+          while IFS= read -r i; do
+            file="$( echo "$i" | cut -d ' ' -f 2,3,5 | tr -s ' ' '_' )"
+            FILTER=gzip queue /tmp/artifacts/pods/${file}.log.gz oc --insecure-skip-tls-verify logs --request-timeout=20s $i
+            FILTER=gzip queue /tmp/artifacts/pods/${file}_previous.log.gz oc --insecure-skip-tls-verify logs --request-timeout=20s -p $i
+          done < /tmp/containers
+
+          echo "Gathering kube-apiserver audit.log ..."
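+          # The node-logs listing prints "<node> <filename>" pairs, one per line; each audit
+          # log file is then fetched and gzipped individually.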
+          oc --insecure-skip-tls-verify adm node-logs --role=master --path=kube-apiserver/ > /tmp/kube-audit-logs
+          while IFS=$'\n' read -r line; do
+            IFS=' ' read -ra log <<< "${line}"
+            FILTER=gzip queue /tmp/artifacts/nodes/"${log[0]}"-"${log[1]}".gz oc --insecure-skip-tls-verify adm node-logs "${log[0]}" --path=kube-apiserver/"${log[1]}"
+          done < /tmp/kube-audit-logs
+
+          echo "Gathering openshift-apiserver audit.log ..."
+          oc --insecure-skip-tls-verify adm node-logs --role=master --path=openshift-apiserver/ > /tmp/openshift-audit-logs
+          while IFS=$'\n' read -r line; do
+            IFS=' ' read -ra log <<< "${line}"
+            FILTER=gzip queue /tmp/artifacts/nodes/"${log[0]}"-"${log[1]}".gz oc --insecure-skip-tls-verify adm node-logs "${log[0]}" --path=openshift-apiserver/"${log[1]}"
+          done < /tmp/openshift-audit-logs
+
+          echo "Snapshotting prometheus (may take 15s) ..."
+          queue /tmp/artifacts/metrics/prometheus.tar.gz oc --insecure-skip-tls-verify exec -n openshift-monitoring prometheus-k8s-0 -- tar cvzf - -C /prometheus .
+
+          echo "Waiting for logs ..."
+          wait
+
+          echo "Deprovisioning cluster ..."
+          cd /tmp/shared/tf
+          rm -rf .terraform
+          terraform init -input=false -no-color
+          terraform destroy -auto-approve -no-color
+        }
+
+        trap 'teardown' EXIT
+        trap 'kill $(jobs -p); exit 0' TERM
+
+        for i in $(seq 1 180); do
+          if [[ -f /tmp/shared/exit ]]; then
+            exit 0
+          fi
+          sleep 60 & wait
+        done
diff --git a/cluster/ci/config/prow/plugins.yaml b/cluster/ci/config/prow/plugins.yaml
index f47e6417d7ea3..029f73cac346a 100644
--- a/cluster/ci/config/prow/plugins.yaml
+++ b/cluster/ci/config/prow/plugins.yaml
@@ -867,6 +867,10 @@ config_updater:
       name: prow-job-cluster-launch-installer-libvirt-e2e
     additional_namespaces:
     - ci-stg
+  ci-operator/templates/openshift/installer/cluster-launch-installer-upi-e2e.yaml:
+    name: prow-job-cluster-launch-installer-upi-e2e
+    additional_namespaces:
+    - ci-stg
   ci-operator/templates/openshift/installer/cluster-launch-installer-openstack-e2e.yaml:
     name: prow-job-cluster-launch-installer-openstack-e2e
     additional_namespaces: