@@ -182,7 +182,7 @@ presubmits:
secretName: sentry-dsn
trigger: '(?m)^/test (?:.*? )?e2e-aws-upgrade(?: .*?)?$'
- agent: kubernetes
-always_run: false
+always_run: true
Member:
I think we want a longer track record of success before we do this (although it's really up to the MCO team). Currently, the past 24 hours show three failures and no successes for this job.

Contributor Author:
We can't have passing tests before the MCO scripts are debugged, and we can't properly test those without a dedicated test for DR scenarios (e.g. openshift/machine-config-operator#793 (comment)).

Member (@wking), May 25, 2019:
You can /test e2e-restore-cluster-state in that PR and it will run (and rerun after each bump) to help you debug that PR. No need to run this in all other MCO PRs while you debug that one.

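(For context, a /test comment works because Prow matches it against the job's trigger regex. A minimal sketch of that matching, assuming the e2e-restore-cluster-state trigger follows the same pattern as the e2e-aws-upgrade trigger shown above; the exact regex for this job is an assumption here:)

# Hypothetical trigger check: the regex is modeled on the e2e-aws-upgrade trigger
# in the hunk above; the real e2e-restore-cluster-state trigger may differ.
comment='/test e2e-restore-cluster-state'
trigger='(?m)^/test (?:.*? )?e2e-restore-cluster-state(?: .*?)?$'
if grep -qP "${trigger}" <<<"${comment}"; then
  echo "this comment would trigger ci/prow/e2e-restore-cluster-state"
fi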
Contributor Author:
It's not clear which MCO PR would break DR scenarios. Also, if this test is misbehaving, it can be skipped with /skip since it's optional.

branches:
- master
context: ci/prow/e2e-restore-cluster-state
@@ -198,32 +198,18 @@ objects:
BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')

function bastion_ssh() {
-while true
+local rc=1
+for i in $(seq 0 30)
do
-ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=5 -W %h:%p core@${BASTION_HOST}" $@ && break
+ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=5 -W %h:%p core@${BASTION_HOST}" $@ && rc=0 && break
sleep 10
done
if [ "$rc" != "0" ]; then exit $rc; fi
}

echo "Make etcd backup on first master"
echo "Make etcd backup on first master - /usr/local/bin/etcd-snapshot-backup.sh"
FIRST_MASTER=$(oc get node -l node-role.kubernetes.io/master= -o name | head -n1 | cut -d '/' -f 2)
-cat > /tmp/etcd_backup.sh <<'EOF'
-#!/bin/bash
-set -x
-RUNNING_ETCD_POD=$(crictl pods -q --label k8s-app=etcd --state=Ready)
-RUNNING_ETCD_CONTAINER=$(crictl ps --pod ${RUNNING_ETCD_POD} --name etcd-member -q)
-crictl exec ${RUNNING_ETCD_CONTAINER} /bin/sh -c 'source /run/etcd/environment && ETCDCTL_API=3 etcdctl --cert /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.crt --key /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.key --cacert /etc/ssl/etcd/ca.crt snapshot save /var/lib/etcd/snapshot.db'
-EOF
-chmod +x /tmp/etcd_backup.sh
-scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" /tmp/etcd_backup.sh "core@${FIRST_MASTER}":/tmp
-bastion_ssh "core@${FIRST_MASTER}" "sudo -i /tmp/etcd_backup.sh"
-
-echo "Backup manifests"
-mapfile -t MASTERS < <(oc get node -l node-role.kubernetes.io/master= -o name | cut -d '/' -f 2)
-for master in "${MASTERS[@]}"
-do
-bastion_ssh "core@${master}" "sudo -i mkdir /etc/kubernetes/manifests-backup && sudo -i cp -rvf /etc/kubernetes/manifests/* /etc/kubernetes/manifests-backup"
-done
+bastion_ssh "core@${FIRST_MASTER}" "sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-backup.sh /root/assets/backup/snapshot.db && sudo -i cp /root/assets/backup/snapshot.db /tmp/snapshot.db && sudo -i chown core:core /tmp/snapshot.db"

# TODO: upgrade conditionally here

@@ -236,8 +222,8 @@ objects:
for master in "${MASTERS[@]}"
do
scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" ${KUBE_SSH_KEY_PATH} "core@${master}":/home/core/.ssh/id_rsa
bastion_ssh "core@${master}" "chmod 0600 /home/core/.ssh/id_rsa"
bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /var/lib/etcd/snapshot.db core@${master}:/tmp/snapshot.db"
bastion_ssh "core@${master}" "sudo -i chmod 0600 /home/core/.ssh/id_rsa"
bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/snapshot.db core@${master}:/tmp/snapshot.db"
done

echo "Collect etcd names"
@@ -250,60 +236,11 @@ objects:
done
ETCD_CONN_STRING=${ETCD_CONN_STRING_TEMP::-1}

echo "Stop static pods"
echo "Restore etcd cluster from snapshot"
for master in "${MASTERS[@]}"
do
bastion_ssh "core@${master}" "sudo -i mkdir /etc/kubernetes/stopped-pods && sudo -i mv /etc/kubernetes/manifests/* /etc/kubernetes/stopped-pods"
done

echo "Restore etcd from snapshot"
cat > /tmp/etcd_restore.sh <<'EOF'
#!/bin/bash
set -ex

ETCD_VER=v3.3.10
DOWNLOAD_URL=https://storage.googleapis.com/etcd
ASSET_DIR=/root/.local

mkdir -p ${ASSET_DIR}/bin ${ASSET_DIR}/tmp ${ASSET_DIR}/shared

echo "Downloading etcdctl binary.."
curl -s -L ${DOWNLOAD_URL}/${ETCD_VER}/etcd-${ETCD_VER}-linux-amd64.tar.gz -o $ASSET_DIR/tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz \
&& tar -xzf $ASSET_DIR/tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz -C $ASSET_DIR/shared --strip-components=1 \
&& mv $ASSET_DIR/shared/etcdctl $ASSET_DIR/bin/ \
&& rm $ASSET_DIR/shared/etcd \
&& ETCDCTL_API=3 $ASSET_DIR/bin/etcdctl version

ETCD_NAME="etcd-member-$(hostname -f)"

rm -rf /var/lib/etcd/

source /run/etcd/environment
ETCDCTL_API=3 etcdctl \
--cert /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.crt \
--key /etc/ssl/etcd/system:etcd-peer:${ETCD_DNS_NAME}.key \
--cacert /etc/ssl/etcd/ca.crt \
snapshot restore /tmp/snapshot.db \
--name ${ETCD_NAME} \
--initial-cluster "$@" \
--initial-cluster-token etcd-cluster-1 \
--skip-hash-check=true \
--initial-advertise-peer-urls https://${ETCD_IPV4_ADDRESS}:2380 \
--data-dir /var/lib/etcd/
EOF
chmod +x /tmp/etcd_restore.sh
for master in "${MASTERS[@]}"
do
scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" /tmp/etcd_restore.sh "core@${master}":/tmp
bastion_ssh "core@${master}" "sudo -i /tmp/etcd_restore.sh ${ETCD_CONN_STRING}"
done

for master in "${MASTERS[@]}"
do
while true
do
bastion_ssh "core@${master}" "sudo -i mv /etc/kubernetes/manifests-backup/* /etc/kubernetes/manifests" && break; sleep 10
done
echo "Running /usr/local/bin/etcd-snapshot-restore.sh on ${master}"
bastion_ssh "core@${master}" "sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-restore.sh /tmp/snapshot.db ${ETCD_CONN_STRING}"
done

echo "Wait for API server to come up"
@@ -317,6 +254,13 @@ objects:
for i in $(seq 0 10); do oc get machineconfigpool/master > /dev/null && break; sleep 30; done
wait_for_machineconfigpool_to_apply

echo "Wait for all kube-apiserver pods to come back"
for master in ${MASTERS[@]}
do
oc get pod/kube-apiserver-${master} -n openshift-kube-apiserver -o name
oc wait pod/kube-apiserver-${master} -n openshift-kube-apiserver --for condition=Ready --timeout=5m
done

echo "Verify 99-rollback-test machineconfig"
MC="$(oc get machineconfig/99-rollback-test -o jsonpath='{.spec.config.storage.files[0].contents.source}')"
if [[ "${MC}" != "data:,A" ]]; then
@@ -340,6 +284,10 @@ objects:
echo "Removing ssh-bastion"
oc delete project openshift-ssh-bastion
oc wait namespace/openshift-ssh-bastion --for delete --timeout=10m
+
+echo "Remove existing openshift-apiserver pods"
+# This would ensure "Pod 'openshift-apiserver/apiserver-xxx' is not healthy: container openshift-apiserver has restarted more than 5 times" test won't fail
+oc delete pod --all -n openshift-apiserver
Member:
hmm, this probably also blows away our logs for those pods? Maybe we want to pull down their logs into the shared artifacts volume before doing this?

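(A minimal sketch of what that could look like, assuming the cleanup step can write to a shared artifacts volume mounted at /tmp/artifacts; the actual mount path and directory layout depend on the template:)

echo "Saving openshift-apiserver pod logs before deleting the pods"
# /tmp/artifacts is an assumed artifacts mount point; adjust to the template's real path.
LOG_DIR=/tmp/artifacts/openshift-apiserver
mkdir -p "${LOG_DIR}"
for pod in $(oc get pods -n openshift-apiserver -o name | cut -d '/' -f 2)
do
  # Current container logs; don't fail the cleanup if a pod is already gone.
  oc logs "pod/${pod}" -n openshift-apiserver > "${LOG_DIR}/${pod}.log" || true
  # Logs from the previous container instance, useful when it has restarted.
  oc logs "pod/${pod}" -n openshift-apiserver --previous > "${LOG_DIR}/${pod}-previous.log" || true
done

(The oc delete pod --all call would then run only after the logs are safely copied out.)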
}

function run-upgrade-tests() {