diff --git a/ci-operator/config/openshift/installer/openshift-installer-master.yaml b/ci-operator/config/openshift/installer/openshift-installer-master.yaml
index 9364bf2fe6470..ddacd80e84d2f 100644
--- a/ci-operator/config/openshift/installer/openshift-installer-master.yaml
+++ b/ci-operator/config/openshift/installer/openshift-installer-master.yaml
@@ -146,3 +146,9 @@ tests:
   commands: TEST_SUITE=openshift/conformance/parallel run-tests
   openshift_installer_upi:
     cluster_profile: vsphere
+- as: e2e-restore-cluster-state
+  commands: |
+    restore-cluster-state
+    TEST_SUITE=openshift/conformance/parallel run-tests
+  openshift_installer:
+    cluster_profile: aws
diff --git a/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml b/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml
index f062f46995ade..136218a526b58 100644
--- a/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml
+++ b/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml
@@ -580,6 +580,73 @@ presubmits:
           name: prow-job-cluster-launch-installer-openstack-e2e
         name: job-definition
     trigger: '(?m)^/test (?:.*? )?e2e-openstack(?: .*?)?$'
+  - agent: kubernetes
+    always_run: false
+    branches:
+    - master
+    context: ci/prow/e2e-restore-cluster-state
+    decorate: true
+    decoration_config:
+      skip_cloning: true
+    labels:
+      ci-operator.openshift.io/prowgen-controlled: "true"
+    name: pull-ci-openshift-installer-master-e2e-restore-cluster-state
+    optional: true
+    rerun_command: /test e2e-restore-cluster-state
+    spec:
+      containers:
+      - args:
+        - --artifact-dir=$(ARTIFACTS)
+        - --give-pr-author-access-to-namespace=true
+        - --secret-dir=/usr/local/e2e-restore-cluster-state-cluster-profile
+        - --sentry-dsn-path=/etc/sentry-dsn/ci-operator
+        - --target=e2e-restore-cluster-state
+        - --template=/usr/local/e2e-restore-cluster-state
+        command:
+        - ci-operator
+        env:
+        - name: CLUSTER_TYPE
+          value: aws
+        - name: CONFIG_SPEC
+          valueFrom:
+            configMapKeyRef:
+              key: openshift-installer-master.yaml
+              name: ci-operator-master-configs
+        - name: JOB_NAME_SAFE
+          value: e2e-restore-cluster-state
+        - name: TEST_COMMAND
+          value: |
+            restore-cluster-state
+            TEST_SUITE=openshift/conformance/parallel run-tests
+        image: ci-operator:latest
+        imagePullPolicy: Always
+        name: ""
+        resources:
+          requests:
+            cpu: 10m
+        volumeMounts:
+        - mountPath: /usr/local/e2e-restore-cluster-state-cluster-profile
+          name: cluster-profile
+        - mountPath: /usr/local/e2e-restore-cluster-state
+          name: job-definition
+          subPath: cluster-launch-installer-e2e.yaml
+        - mountPath: /etc/sentry-dsn
+          name: sentry-dsn
+          readOnly: true
+      serviceAccountName: ci-operator
+      volumes:
+      - name: cluster-profile
+        projected:
+          sources:
+          - secret:
+              name: cluster-secrets-aws
+      - configMap:
+          name: prow-job-cluster-launch-installer-e2e
+        name: job-definition
+      - name: sentry-dsn
+        secret:
+          secretName: sentry-dsn
+    trigger: '(?m)^/test (?:.*? )?e2e-restore-cluster-state(?: .*?)?$'
   - agent: kubernetes
     always_run: false
     branches:
diff --git a/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml b/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml
index 10bb5952a20f2..85a035fe52a14 100644
--- a/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml
+++ b/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml
@@ -159,6 +159,205 @@ objects:
         mkdir -p /tmp/output
         cd /tmp/output
 
+        # restore-cluster-state: end-to-end check that a cluster recovers from an etcd snapshot.
+        #   1. creates MachineConfig 99-rollback-test, which writes "A" to /etc/rollback-test
+        #   2. saves an etcd snapshot on the first master and copies each master's static pod manifests aside
+        #   3. patches the MachineConfig to "B" and waits for the master pool
+        #   4. restores etcd on every master from the snapshot and moves the manifests back
+        #   5. checks that the MachineConfig and /etc/rollback-test both read "A" again
+        # Reads KUBE_SSH_KEY_PATH; uses oc, ssh, scp and curl. Exits 1 when a check fails.
+        function restore-cluster-state() {
+          echo "Placing file /etc/rollback-test with contents A"
+          cat > /tmp/machineconfig.yaml <<'EOF'
+        apiVersion: machineconfiguration.openshift.io/v1
+        kind: MachineConfig
+        metadata:
+          labels:
+            machineconfiguration.openshift.io/role: master
+          name: 99-rollback-test
+        spec:
+          config:
+            ignition:
+              version: 2.2.0
+            storage:
+              files:
+              - contents:
+                  source: data:,A
+                filesystem: root
+                mode: 420
+                path: /etc/rollback-test
+        EOF
+          oc create -f /tmp/machineconfig.yaml
+
+          # Waits for the master pool to report Updating, then Updated (up to 11 tries of 5m each); a timeout is not reported.
+          function wait_for_machineconfigpool_to_apply() {
+            for i in $(seq 0 10); do oc wait machineconfigpool/master --for=condition=Updating --timeout=5m && break; done
+            for i in $(seq 0 10); do oc wait machineconfigpool/master --for=condition=Updated --timeout=5m && break; sleep 30; done
+          }
+
+          wait_for_machineconfigpool_to_apply
+
+          echo "Setting up ssh bastion"
+          mkdir -p ~/.ssh || true
+          cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
+          chmod 0600 ~/.ssh/id_rsa
+          # Presumably needed so ssh can resolve the current uid to a user name - TODO confirm.
+          echo "${USER:-default}:x:$(id -u):$(id -g):Default User:$HOME:/sbin/nologin" >> /etc/passwd
+          # NOTE(review): this pipes an unpinned script from a third-party repository's master branch into bash.
+          curl https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/deploy.sh | bash
+          BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
+
+          # bastion_ssh HOST COMMAND...: runs COMMAND on HOST over ssh, proxied through the bastion.
+          # Retries every 10s until ssh exits 0, so every remote command must be safe to re-run.
+          function bastion_ssh() {
+            while true
+            do
+              ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=5 -W %h:%p core@${BASTION_HOST}" "$@" && break
+              sleep 10
+            done
+          }
+
+          echo "Make etcd backup on first master"
+          FIRST_MASTER=$(oc get node -l node-role.kubernetes.io/master= -o name | head -n1 | cut -d '/' -f 2)
+          cat > /tmp/etcd_backup.sh <<'EOF'
+        #!/bin/bash
+        set -x
+        RUNNING_ETCD_POD=$(crictl pods -q --label k8s-app=etcd --state=Ready)
+        RUNNING_ETCD_CONTAINER=$(crictl ps --pod ${RUNNING_ETCD_POD} --name etcd-member -q)
+        crictl exec ${RUNNING_ETCD_CONTAINER} /bin/sh -c 'source /run/etcd/environment && ETCDCTL_API=3 etcdctl --cert /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.crt --key /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.key --cacert /etc/ssl/etcd/ca.crt snapshot save /var/lib/etcd/snapshot.db'
+        EOF
+          chmod +x /tmp/etcd_backup.sh
+          scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" /tmp/etcd_backup.sh "core@${FIRST_MASTER}":/tmp
+          bastion_ssh "core@${FIRST_MASTER}" "sudo -i /tmp/etcd_backup.sh"
+
+          echo "Backup manifests"
+          mapfile -t MASTERS < <(oc get node -l node-role.kubernetes.io/master= -o name | cut -d '/' -f 2)
+          for master in "${MASTERS[@]}"
+          do
+            # mkdir -p keeps this step re-runnable when bastion_ssh retries after a partial failure.
+            bastion_ssh "core@${master}" "sudo -i mkdir -p /etc/kubernetes/manifests-backup && sudo -i cp -rvf /etc/kubernetes/manifests/* /etc/kubernetes/manifests-backup"
+          done
+
+          # TODO: upgrade conditionally here
+
+          echo "Update rollback-test machineconfig"
+          # MachineConfig is cluster-scoped, so no namespace is passed.
+          oc patch machineconfig 99-rollback-test --patch '{"spec":{"config":{"storage":{"files":[{"contents":{"source":"data:,B","verification":{}},"filesystem":"root","mode":420,"path":"/etc/rollback-test"}]}}}}' --type=merge
+          wait_for_machineconfigpool_to_apply
+
+          echo "Distribute snapshot across all masters"
+          mapfile -t MASTERS < <(oc get node -l node-role.kubernetes.io/master= -o name | cut -d '/' -f 2)
+          for master in "${MASTERS[@]}"
+          do
+            scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" "${KUBE_SSH_KEY_PATH}" "core@${master}":/home/core/.ssh/id_rsa
+            bastion_ssh "core@${master}" "chmod 0600 /home/core/.ssh/id_rsa"
+            bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /var/lib/etcd/snapshot.db core@${master}:/tmp/snapshot.db"
+          done
+
+          echo "Collect etcd names"
+          ETCD_CONN_STRING_TEMP=""
+          for master in "${MASTERS[@]}"
+          do
+            ETCD_NAME="etcd-member-$(bastion_ssh core@${master} hostname -f)"
+            ETCD_URI=$(bastion_ssh "core@${master}" 'source /run/etcd/environment && echo "https://${ETCD_DNS_NAME}:2380"')
+            ETCD_CONN_STRING_TEMP="${ETCD_CONN_STRING_TEMP}${ETCD_NAME}=${ETCD_URI},"
+          done
+          # Drop the trailing comma; unlike ${var::-1} this is also valid when the string is empty.
+          ETCD_CONN_STRING=${ETCD_CONN_STRING_TEMP%,}
+
+          echo "Stop static pods"
+          for master in "${MASTERS[@]}"
+          do
+            bastion_ssh "core@${master}" "sudo -i mkdir -p /etc/kubernetes/stopped-pods && sudo -i mv /etc/kubernetes/manifests/* /etc/kubernetes/stopped-pods"
+          done
+
+          echo "Restore etcd from snapshot"
+          # The generated script calls etcdctl by its download path instead of relying on root's PATH.
+          cat > /tmp/etcd_restore.sh <<'EOF'
+        #!/bin/bash
+        set -ex
+
+        ETCD_VER=v3.3.10
+        DOWNLOAD_URL=https://storage.googleapis.com/etcd
+        ASSET_DIR=/root/.local
+
+        mkdir -p ${ASSET_DIR}/bin ${ASSET_DIR}/tmp ${ASSET_DIR}/shared
+
+        echo "Downloading etcdctl binary.."
+        curl -s -L ${DOWNLOAD_URL}/${ETCD_VER}/etcd-${ETCD_VER}-linux-amd64.tar.gz -o $ASSET_DIR/tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz \
+          && tar -xzf $ASSET_DIR/tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz -C $ASSET_DIR/shared --strip-components=1 \
+          && mv $ASSET_DIR/shared/etcdctl $ASSET_DIR/bin/ \
+          && rm $ASSET_DIR/shared/etcd \
+          && ETCDCTL_API=3 $ASSET_DIR/bin/etcdctl version
+
+        ETCD_NAME="etcd-member-$(hostname -f)"
+
+        rm -rf /var/lib/etcd/
+
+        source /run/etcd/environment
+        ETCDCTL_API=3 $ASSET_DIR/bin/etcdctl \
+          --cert /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.crt \
+          --key /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.key \
+          --cacert /etc/ssl/etcd/ca.crt \
+          snapshot restore /tmp/snapshot.db \
+          --name ${ETCD_NAME} \
+          --initial-cluster "$@" \
+          --initial-cluster-token etcd-cluster-1 \
+          --skip-hash-check=true \
+          --initial-advertise-peer-urls https://${ETCD_IPV4_ADDRESS}:2380 \
+          --data-dir /var/lib/etcd/
+        EOF
+          chmod +x /tmp/etcd_restore.sh
+          for master in "${MASTERS[@]}"
+          do
+            scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" /tmp/etcd_restore.sh "core@${master}":/tmp
+            bastion_ssh "core@${master}" "sudo -i /tmp/etcd_restore.sh ${ETCD_CONN_STRING}"
+          done
+
+          for master in "${MASTERS[@]}"
+          do
+            while true
+            do
+              bastion_ssh "core@${master}" "sudo -i mv /etc/kubernetes/manifests-backup/* /etc/kubernetes/manifests" && break; sleep 10
+            done
+          done
+
+          echo "Wait for API server to come up"
+          for i in $(seq 0 10)
+          do
+            oc get nodes && break
+            sleep 30
+          done
+
+          echo "Wait for MCO to rollout new configs"
+          for i in $(seq 0 10); do oc get machineconfigpool/master > /dev/null && break; sleep 30; done
+          wait_for_machineconfigpool_to_apply
+
+          echo "Verify 99-rollback-test machineconfig"
+          MC="$(oc get machineconfig/99-rollback-test -o jsonpath='{.spec.config.storage.files[0].contents.source}')"
+          if [[ "${MC}" != "data:,A" ]]; then
+            echo "Unexpected MachineConfig output: ${MC}"
+            exit 1
+          fi
+
+          echo "Verify /etc/rollback-test contents"
+          rc=0
+          for master in "${MASTERS[@]}"
+          do
+            ROLLBACK="$(bastion_ssh core@${master} "sudo -i cat /etc/rollback-test")"
+            if [[ "${ROLLBACK}" != "A" ]]; then
+              echo "Master ${master} rollback contents was ${ROLLBACK}"
+              rc=1
+            fi
+          done
+
+          if [[ "${rc}" == "1" ]]; then exit 1; fi
+
+          echo "Removing ssh-bastion"
+          oc delete project openshift-ssh-bastion
+          oc wait namespace/openshift-ssh-bastion --for delete --timeout=10m
+        }
+
         function run-upgrade-tests() {
           openshift-tests run-upgrade "${TEST_SUITE}" --to-image "${RELEASE_IMAGE_LATEST}" \
             --provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit