Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,9 @@ tests:
commands: TEST_SUITE=openshift/conformance/parallel run-tests
openshift_installer_upi:
cluster_profile: vsphere
- as: e2e-restore-cluster-state
commands: |
restore-cluster-state
TEST_SUITE=openshift/conformance/parallel run-tests
openshift_installer:
cluster_profile: aws
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,73 @@ presubmits:
name: prow-job-cluster-launch-installer-openstack-e2e
name: job-definition
trigger: '(?m)^/test (?:.*? )?e2e-openstack(?: .*?)?$'
- agent: kubernetes
always_run: false
branches:
- master
context: ci/prow/e2e-restore-cluster-state
decorate: true
decoration_config:
skip_cloning: true
labels:
ci-operator.openshift.io/prowgen-controlled: "true"
name: pull-ci-openshift-installer-master-e2e-restore-cluster-state
optional: true
rerun_command: /test e2e-restore-cluster-state
spec:
containers:
- args:
- --artifact-dir=$(ARTIFACTS)
- --give-pr-author-access-to-namespace=true
- --secret-dir=/usr/local/e2e-restore-cluster-state-cluster-profile
- --sentry-dsn-path=/etc/sentry-dsn/ci-operator
- --target=e2e-restore-cluster-state
- --template=/usr/local/e2e-restore-cluster-state
command:
- ci-operator
env:
- name: CLUSTER_TYPE
value: aws
- name: CONFIG_SPEC
valueFrom:
configMapKeyRef:
key: openshift-installer-master.yaml
name: ci-operator-master-configs
- name: JOB_NAME_SAFE
value: e2e-restore-cluster-state
- name: TEST_COMMAND
value: |
restore-cluster-state
TEST_SUITE=openshift/conformance/parallel run-tests
image: ci-operator:latest
imagePullPolicy: Always
name: ""
resources:
requests:
cpu: 10m
volumeMounts:
- mountPath: /usr/local/e2e-restore-cluster-state-cluster-profile
name: cluster-profile
- mountPath: /usr/local/e2e-restore-cluster-state
name: job-definition
subPath: cluster-launch-installer-e2e.yaml
- mountPath: /etc/sentry-dsn
name: sentry-dsn
readOnly: true
serviceAccountName: ci-operator
volumes:
- name: cluster-profile
projected:
sources:
- secret:
name: cluster-secrets-aws
- configMap:
name: prow-job-cluster-launch-installer-e2e
name: job-definition
- name: sentry-dsn
secret:
secretName: sentry-dsn
trigger: '(?m)^/test (?:.*? )?e2e-restore-cluster-state(?: .*?)?$'
- agent: kubernetes
always_run: false
branches:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,189 @@ objects:
mkdir -p /tmp/output
cd /tmp/output

# restore-cluster-state exercises an etcd backup/restore round trip on the
# masters: it creates a marker MachineConfig whose file contents are "A",
# snapshots etcd, changes the marker to "B", restores the snapshot on every
# master and finally verifies that both the MachineConfig and
# /etc/rollback-test are back to "A".
# Reads KUBE_SSH_KEY_PATH; drives the cluster through oc, ssh, scp and curl.
# Calls `exit 1` (not `return`) when verification fails.
function restore-cluster-state() {
echo "Placing file /etc/rollback-test with contents A"
# The quoted 'EOF' delimiter writes the manifest literally, with no shell
# expansion. It targets the master role and places /etc/rollback-test with
# contents "A" (mode 420 is decimal for octal 0644).
cat > /tmp/machineconfig.yaml <<'EOF'
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
labels:
machineconfiguration.openshift.io/role: master
name: 99-rollback-test
spec:
config:
ignition:
version: 2.2.0
storage:
files:
- contents:
source: data:,A
filesystem: root
mode: 420
path: /etc/rollback-test
EOF
oc create -f /tmp/machineconfig.yaml

# wait_for_machineconfigpool_to_apply waits for the master MachineConfigPool
# to start updating and then to finish updating.
# Each phase makes at most 11 `oc wait` attempts with a 5 minute timeout each.
# A phase that never succeeds is not reported: the function falls through.
function wait_for_machineconfigpool_to_apply() {
  local attempt

  # Phase 1: the pool reports condition Updating. Attempts run back to back.
  for ((attempt = 0; attempt <= 10; attempt++)); do
    if oc wait machineconfigpool/master --for=condition=Updating --timeout=5m; then
      break
    fi
  done

  # Phase 2: the pool reports condition Updated. Pause 30s after each failure.
  for ((attempt = 0; attempt <= 10; attempt++)); do
    if oc wait machineconfigpool/master --for=condition=Updated --timeout=5m; then
      break
    fi
    sleep 30
  done
}

# Block until the master pool has picked up the 99-rollback-test MachineConfig.
wait_for_machineconfigpool_to_apply

echo "Setting up ssh bastion"
mkdir -p ~/.ssh || true
# Install the cluster SSH key as the default identity used by ssh/scp.
cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
# Append a passwd entry for the current UID/GID; presumably the container
# runs as a UID with no passwd entry, which ssh requires — TODO confirm.
echo "${USER:-default}:x:$(id -u):$(id -g):Default User:$HOME:/sbin/nologin" >> /etc/passwd
# NOTE(review): this pipes a script fetched from a mutable branch straight
# into bash; unless pipefail is set elsewhere, a curl failure goes unnoticed.
curl https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/deploy.sh | bash
# Hostname of the bastion service's load balancer; used as the ssh jump host.
BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')

# bastion_ssh runs ssh through the bastion host and retries every 10 seconds
# until ssh exits 0.
# Globals:   BASTION_HOST (read) - jump host reached as core@${BASTION_HOST}
# Arguments: "$@" - ssh destination followed by the remote command words
# Outputs:   whatever ssh / the remote command prints
# Returns:   0 once an attempt succeeds; never returns if none ever does, so
#            remote commands passed in should be safe to re-run.
function bastion_ssh() {
  local proxy_command="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=5 -W %h:%p core@${BASTION_HOST}"

  # "$@" must stay quoted: unquoted, each argument is re-split and words such
  # as /etc/kubernetes/manifests/* are glob-expanded locally before ssh runs.
  until ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=5 \
    -o StrictHostKeyChecking=no -o ProxyCommand="${proxy_command}" "$@"; do
    sleep 10
  done
}

echo "Make etcd backup on first master"
# Take the first master from the node list and strip everything up to the
# "/" in the `-o name` output, leaving the bare node name.
FIRST_MASTER=$(oc get node -l node-role.kubernetes.io/master= -o name | head -n1 | cut -d '/' -f 2)
# Helper script executed on the master: it locates the Ready etcd pod and its
# etcd-member container via crictl, then saves a v3 snapshot to
# /var/lib/etcd/snapshot.db. The quoted 'EOF' defers all expansion to the
# master.
cat > /tmp/etcd_backup.sh <<'EOF'
#!/bin/bash
set -x
RUNNING_ETCD_POD=$(crictl pods -q --label k8s-app=etcd --state=Ready)
RUNNING_ETCD_CONTAINER=$(crictl ps --pod ${RUNNING_ETCD_POD} --name etcd-member -q)
crictl exec ${RUNNING_ETCD_CONTAINER} /bin/sh -c 'source /run/etcd/environment && ETCDCTL_API=3 etcdctl --cert /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.crt --key /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.key --cacert /etc/ssl/etcd/ca.crt snapshot save /var/lib/etcd/snapshot.db'
EOF
chmod +x /tmp/etcd_backup.sh
# Copy the helper to the master through the bastion, then run it as root.
scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" /tmp/etcd_backup.sh "core@${FIRST_MASTER}":/tmp
bastion_ssh "core@${FIRST_MASTER}" "sudo -i /tmp/etcd_backup.sh"

echo "Backup manifests"
# Copy every master's /etc/kubernetes/manifests into a sibling
# manifests-backup directory so they can be put back after the etcd restore.
mapfile -t MASTERS < <(oc get node -l node-role.kubernetes.io/master= -o name | cut -d '/' -f 2)
for master in "${MASTERS[@]}"; do
  # mkdir -p: bastion_ssh re-runs the whole remote command until it exits 0,
  # so a retry after a partial first attempt must not fail (and then loop
  # forever) on the directory that already exists.
  bastion_ssh "core@${master}" "sudo -i mkdir -p /etc/kubernetes/manifests-backup && sudo -i cp -rvf /etc/kubernetes/manifests/* /etc/kubernetes/manifests-backup"
done

# TODO: upgrade conditionally here

echo "Update rollback-test machineconfig"
# Merge-patch the marker file's contents from "A" to "B". This is the change
# made after the snapshot, which the etcd restore is expected to undo.
# NOTE(review): the MachineConfig is created earlier without a namespace, so
# the -n flag here is presumably ignored — confirm.
oc patch machineconfig 99-rollback-test -n openshift-machine-api --patch '{"spec":{"config":{"storage":{"files":[{"contents":{"source":"data:,B","verification":{}},"filesystem":"root","mode":420,"path":"/etc/rollback-test"}]}}}}' --type=merge
# Wait for the master pool to roll the "B" contents out.
wait_for_machineconfigpool_to_apply

echo "Distribute snapshot across all masters"
# Re-read the master list, then for each master: install the SSH key as
# core's identity so masters can scp between themselves, and have the first
# master push its snapshot to that master's /tmp/snapshot.db.
mapfile -t MASTERS < <(oc get node -l node-role.kubernetes.io/master= -o name | cut -d '/' -f 2)
for master in "${MASTERS[@]}"; do
  # The key path is quoted (as it is where the key is first copied) so a path
  # containing whitespace or glob characters is passed to scp as one argument.
  scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" "${KUBE_SSH_KEY_PATH}" "core@${master}":/home/core/.ssh/id_rsa
  bastion_ssh "core@${master}" "chmod 0600 /home/core/.ssh/id_rsa"
  bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /var/lib/etcd/snapshot.db core@${master}:/tmp/snapshot.db"
done

echo "Collect etcd names"
# Build the comma-separated list of "<member name>=<peer URL>" pairs, one per
# master. The result is later handed to the restore script, which passes it
# to etcdctl as --initial-cluster.
ETCD_CONN_STRING_TEMP=""
for master in "${MASTERS[@]}"; do
  # Member name is "etcd-member-" plus the master's FQDN.
  ETCD_NAME="etcd-member-$(bastion_ssh "core@${master}" hostname -f)"
  # Single quotes: ETCD_DNS_NAME is expanded on the master, not locally.
  ETCD_URI=$(bastion_ssh "core@${master}" 'source /run/etcd/environment && echo "https://${ETCD_DNS_NAME}:2380"')
  ETCD_CONN_STRING_TEMP="${ETCD_CONN_STRING_TEMP}${ETCD_NAME}=${ETCD_URI},"
done
# Drop the trailing comma. "%," is a no-op on an empty string, whereas the
# substring form "::-1" aborts with "substring expression < 0".
ETCD_CONN_STRING=${ETCD_CONN_STRING_TEMP%,}

echo "Stop static pods"
# Move every manifest out of /etc/kubernetes/manifests on each master.
for master in "${MASTERS[@]}"; do
  # mkdir -p: bastion_ssh re-runs the whole remote command until it exits 0,
  # so a retry after a partial first attempt must not fail forever on the
  # directory that already exists. The mv still fails if nothing is left to
  # move.
  bastion_ssh "core@${master}" "sudo -i mkdir -p /etc/kubernetes/stopped-pods && sudo -i mv /etc/kubernetes/manifests/* /etc/kubernetes/stopped-pods"
done

echo "Restore etcd from snapshot"
# Helper script executed as root on every master. It downloads the etcd
# v3.3.10 release tarball, places etcdctl in /root/.local/bin, removes
# /var/lib/etcd/ and restores /tmp/snapshot.db into it. The script's arguments
# are passed through as the --initial-cluster value. The quoted 'EOF' defers
# all expansion to the master.
# NOTE(review): the restore step calls bare `etcdctl` rather than
# /root/.local/bin/etcdctl, so it presumably relies on that directory being
# on root's PATH under `sudo -i` — confirm.
cat > /tmp/etcd_restore.sh <<'EOF'
#!/bin/bash
set -ex

ETCD_VER=v3.3.10
DOWNLOAD_URL=https://storage.googleapis.com/etcd
ASSET_DIR=/root/.local

mkdir -p ${ASSET_DIR}/bin ${ASSET_DIR}/tmp ${ASSET_DIR}/shared

echo "Downloading etcdctl binary.."
curl -s -L ${DOWNLOAD_URL}/${ETCD_VER}/etcd-${ETCD_VER}-linux-amd64.tar.gz -o $ASSET_DIR/tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz \
&& tar -xzf $ASSET_DIR/tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz -C $ASSET_DIR/shared --strip-components=1 \
&& mv $ASSET_DIR/shared/etcdctl $ASSET_DIR/bin/ \
&& rm $ASSET_DIR/shared/etcd \
&& ETCDCTL_API=3 $ASSET_DIR/bin/etcdctl version

ETCD_NAME="etcd-member-$(hostname -f)"

rm -rf /var/lib/etcd/

source /run/etcd/environment
ETCDCTL_API=3 etcdctl \
--cert /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.crt \
--key /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.key \
--cacert /etc/ssl/etcd/ca.crt \
snapshot restore /tmp/snapshot.db \
--name ${ETCD_NAME} \
--initial-cluster "$@" \
--initial-cluster-token etcd-cluster-1 \
--skip-hash-check=true \
--initial-advertise-peer-urls https://${ETCD_IPV4_ADDRESS}:2380 \
--data-dir /var/lib/etcd/
EOF
chmod +x /tmp/etcd_restore.sh
# Copy the helper to each master through the bastion and run it there with
# the collected "<name>=<peer URL>,..." string as its argument.
for master in "${MASTERS[@]}"
do
scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" /tmp/etcd_restore.sh "core@${master}":/tmp
bastion_ssh "core@${master}" "sudo -i /tmp/etcd_restore.sh ${ETCD_CONN_STRING}"
done

# Move the backed-up manifests back into /etc/kubernetes/manifests on every
# master, retrying every 10 seconds until the move succeeds.
for master in "${MASTERS[@]}"
do
while true
do
bastion_ssh "core@${master}" "sudo -i mv /etc/kubernetes/manifests-backup/* /etc/kubernetes/manifests" && break; sleep 10
done
done

echo "Wait for API server to come up"
# Poll `oc get nodes` up to 11 times, 30 seconds apart. If the API never
# answers, the loop simply ends and execution continues.
for i in $(seq 0 10)
do
oc get nodes && break
sleep 30
done

echo "Wait for MCO to rollout new configs"
# Same polling pattern: first until the master pool object can be read, then
# until the pool has gone through its Updating/Updated cycle.
for i in $(seq 0 10); do oc get machineconfigpool/master > /dev/null && break; sleep 30; done
wait_for_machineconfigpool_to_apply

echo "Verify 99-rollback-test machineconfig"
# After the restore, the MachineConfig must again carry the pre-snapshot
# contents "data:,A"; anything else aborts immediately.
MC="$(oc get machineconfig/99-rollback-test -o jsonpath='{.spec.config.storage.files[0].contents.source}')"
if [[ "${MC}" != "data:,A" ]]; then
echo "Unexpected MachineConfig output: ${MC}"
exit 1
fi

echo "Verify /etc/rollback-test contents"
# rc accumulates failures so that every master is checked and reported
# before the script exits.
rc=0
for master in "${MASTERS[@]}"
do
ROLLBACK="$(bastion_ssh core@${master} "sudo -i cat /etc/rollback-test")"
if [[ "${ROLLBACK}" != "A" ]]; then
echo "Master ${master} rollback contents was ${ROLLBACK}"
rc=1
fi
done

# `exit` ends the whole shell rather than returning to the caller, so nothing
# after this line in the function runs when verification failed.
if [[ "${rc}" == "1" ]]; then exit 1; fi
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we want exit 1 -> return 1, so the caller can decide if they want to add additional error handling.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

never mind, the rest of this script is all set -e, so this matches that.


echo "Removing ssh-bastion"
# Delete the bastion project, then wait up to 10 minutes for its namespace
# to be gone.
oc delete project openshift-ssh-bastion
oc wait namespace/openshift-ssh-bastion --for delete --timeout=10m
}

function run-upgrade-tests() {
openshift-tests run-upgrade "${TEST_SUITE}" --to-image "${RELEASE_IMAGE_LATEST}" \
--provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit
Expand Down