@@ -141,13 +141,20 @@ tests:
  commands: TEST_SUITE=openshift/conformance run-tests
  openshift_ansible_40:
    cluster_profile: aws-centos-40
- as: e2e-vsphere
  commands: TEST_SUITE=openshift/conformance/parallel run-tests
  openshift_installer_upi:
    cluster_profile: vsphere
- as: e2e-etcd-quorum-loss
  commands: |
    recover-from-etcd-quorum-loss
    TEST_SUITE=openshift/conformance/parallel run-tests
  openshift_installer:
    cluster_profile: aws
    upgrade: false
- as: e2e-restore-cluster-state
  commands: |
    restore-cluster-state
    TEST_SUITE=openshift/conformance/parallel run-tests
  openshift_installer:
    cluster_profile: aws
- as: e2e-vsphere
  commands: TEST_SUITE=openshift/conformance/parallel run-tests
  openshift_installer_upi:
    cluster_profile: vsphere
@@ -348,6 +348,73 @@ presubmits:
      secret:
        secretName: sentry-dsn
  trigger: '(?m)^/test (?:.*? )?e2e-aws-upi(?: .*?)?$'
- agent: kubernetes
  always_run: false
  branches:
  - master
  context: ci/prow/e2e-etcd-quorum-loss
  decorate: true
  decoration_config:
    skip_cloning: true
  labels:
    ci-operator.openshift.io/prowgen-controlled: "true"
  name: pull-ci-openshift-installer-master-e2e-etcd-quorum-loss
  optional: true
  rerun_command: /test e2e-etcd-quorum-loss
  spec:
    containers:
    - args:
      - --artifact-dir=$(ARTIFACTS)
      - --give-pr-author-access-to-namespace=true
      - --secret-dir=/usr/local/e2e-etcd-quorum-loss-cluster-profile
      - --sentry-dsn-path=/etc/sentry-dsn/ci-operator
      - --target=e2e-etcd-quorum-loss
      - --template=/usr/local/e2e-etcd-quorum-loss
      command:
      - ci-operator
      env:
      - name: CLUSTER_TYPE
        value: aws
      - name: CONFIG_SPEC
        valueFrom:
          configMapKeyRef:
            key: openshift-installer-master.yaml
            name: ci-operator-master-configs
      - name: JOB_NAME_SAFE
        value: e2e-etcd-quorum-loss
      - name: TEST_COMMAND
        value: |
          recover-from-etcd-quorum-loss
          TEST_SUITE=openshift/conformance/parallel run-tests
      image: ci-operator:latest
      imagePullPolicy: Always
      name: ""
      resources:
        requests:
          cpu: 10m
      volumeMounts:
      - mountPath: /usr/local/e2e-etcd-quorum-loss-cluster-profile
        name: cluster-profile
      - mountPath: /usr/local/e2e-etcd-quorum-loss
        name: job-definition
        subPath: cluster-launch-installer-e2e.yaml
      - mountPath: /etc/sentry-dsn
        name: sentry-dsn
        readOnly: true
    serviceAccountName: ci-operator
    volumes:
    - name: cluster-profile
      projected:
        sources:
        - secret:
            name: cluster-secrets-aws
    - configMap:
        name: prow-job-cluster-launch-installer-e2e
      name: job-definition
    - name: sentry-dsn
      secret:
        secretName: sentry-dsn
  trigger: '(?m)^/test (?:.*? )?e2e-etcd-quorum-loss(?: .*?)?$'
- agent: kubernetes
  always_run: false
  branches:
@@ -103,6 +103,10 @@ objects:
  value: /tmp/home
- name: KUBECONFIG
  value: /tmp/artifacts/installer/auth/kubeconfig
- name: CLUSTER_NAME
  value: ${NAMESPACE}-${JOB_NAME_HASH}
- name: BASE_DOMAIN
  value: ${BASE_DOMAIN}
command:
- /bin/bash
- -c
@@ -159,6 +163,43 @@ objects:
mkdir -p /tmp/output
cd /tmp/output

# Retry the given command, sleeping 10s between attempts; fail the job if it never succeeds.
function retry() {
  local ATTEMPTS="${1}"
  shift
  echo "${@}"
  rc=1
  for i in $(seq 0 ${ATTEMPTS}); do
    "${@}" && rc=0 && break
    sleep 10
  done
  if [ "${rc}" != "0" ]; then exit 1; fi
}

# Deploy an ssh bastion into the cluster and wait for its load balancer hostname to be published.
function setup_ssh_bastion() {
  echo "Setting up ssh bastion"
  mkdir -p ~/.ssh || true
  cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
  chmod 0600 ~/.ssh/id_rsa
  if ! whoami &> /dev/null; then
    if [ -w /etc/passwd ]; then
      echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd
    fi
  fi
  curl https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/deploy.sh | bash
  for i in $(seq 0 60)
  do
    BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
    if [ ! -z "${BASTION_HOST}" ]; then break; fi
    sleep 10
  done
}

# ssh to a cluster node through the bastion, retrying until the connection succeeds.
function bastion_ssh() {
  retry 60 \
    ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
    -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=5 -W %h:%p core@${BASTION_HOST}" \
    $@
}

function restore-cluster-state() {
  echo "Placing file /etc/rollback-test with contents A"
  cat > /tmp/machineconfig.yaml <<'EOF'
@@ -189,21 +230,7 @@ objects:

  wait_for_machineconfigpool_to_apply

  echo "Setting up ssh bastion"
  mkdir -p ~/.ssh || true
  cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
  chmod 0600 ~/.ssh/id_rsa
  echo "${USER:-default}:x:$(id -u):$(id -g):Default User:$HOME:/sbin/nologin" >> /etc/passwd
  curl https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/deploy.sh | bash
  BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')

  function bastion_ssh() {
    while true
    do
      ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=5 -W %h:%p core@${BASTION_HOST}" $@ && break
      sleep 10
    done
  }
  setup_ssh_bastion

  echo "Make etcd backup on first master"
  FIRST_MASTER=$(oc get node -l node-role.kubernetes.io/master= -o name | head -n1 | cut -d '/' -f 2)
@@ -342,6 +369,222 @@ objects:
  oc wait namespace/openshift-ssh-bastion --for delete --timeout=10m
}

function recover-from-etcd-quorum-loss() {
  mkdir -p ~/.ssh || true
  cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
  chmod 0600 ~/.ssh/id_rsa
  if ! whoami &> /dev/null; then
    if [ -w /etc/passwd ]; then
      echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd
    fi
  fi

  setup_ssh_bastion

  # Machine API won't let the user destroy the node which runs the controller
  echo "Finding two masters to destroy"
  MAPI_POD=$(oc get pod -l k8s-app=controller -n openshift-machine-api --no-headers -o name)
  SURVIVING_MASTER_NODE=$(oc get ${MAPI_POD} -n openshift-machine-api -o jsonpath='{.spec.nodeName}')
  mapfile -t MASTER_NODES_TO_REMOVE < <(oc get nodes -l node-role.kubernetes.io/master= -o name | grep -v "${SURVIVING_MASTER_NODE}")
  MASTER_MACHINES_TO_REMOVE=()
  for master in ${MASTER_NODES_TO_REMOVE[@]}
  do
    MASTER_MACHINES_TO_REMOVE+=($(oc get ${master} -o jsonpath='{.metadata.annotations.machine\.openshift\.io\/machine}' | cut -d '/' -f 2))
  done

  echo "Destroy two masters"
  # Scale down etcd quorum guard
  oc scale --replicas=0 deployment.apps/etcd-quorum-guard -n openshift-machine-config-operator

  for machine in ${MASTER_MACHINES_TO_REMOVE[@]}
  do
    retry 10 oc --request-timeout=5s -n openshift-machine-api delete machine ${machine}
  done

  echo "Confirm meltdown"
  sleep 30
Member:
What are we waiting for here? This seems brittle.

Contributor (author):
The Machine API won't wait for the machine to actually be deleted during the oc delete machine call, so the API stays available for a few seconds after the two masters are removed. This pause is necessary to confirm the API is no longer responding.
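A less brittle alternative would be to poll until the API actually stops answering rather than sleeping a fixed 30 seconds; a minimal sketch, not what the PR currently does:

# Sketch only: poll until the API stops responding instead of a fixed sleep.
API_DOWN=0
for i in $(seq 0 30); do
  if ! oc --request-timeout=5s get nodes &> /dev/null; then
    API_DOWN=1
    break
  fi
  sleep 10
done
# Fail if the API is still answering after all attempts
if [ "${API_DOWN}" != "1" ]; then
  echo "API is still responding after deleting two masters"
  exit 1
fi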

  # The API should no longer respond; if this call still succeeds, quorum was not lost and the test fails
  oc --request-timeout=5s get nodes && exit 1

  echo "Upload a script to restore single node cluster"
  cat > /tmp/install_recovery_scripts.sh <<'SCRIPT_EOF'
#!/bin/bash
set -x
if [[ $EUID -ne 0 ]]; then
  echo "This script must be run as root"
  exit 1
fi

PREFIX="https://raw.githubusercontent.com/vrutkovs/openshift-recovery/unify-fixes"
Member:
floating downloads 😭 😉

Contributor (author):
It's a temporary measure while we debug the MCO scripts. This is fixed in #3842, which is the same PR plus the MCO scripts.
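Until then, one way to make the download less floating would be to pin PREFIX to an immutable commit rather than a branch; a sketch with a placeholder SHA, not an actual revision from that repository:

# Sketch only: pin the recovery scripts to a fixed commit. <commit-sha> is a placeholder.
PREFIX="https://raw.githubusercontent.com/vrutkovs/openshift-recovery/<commit-sha>"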


curl -s $PREFIX/bin/etcd-member-recover.sh -o /usr/local/bin/etcd-member-recover.sh
chmod 755 /usr/local/bin/etcd-member-recover.sh

curl -s $PREFIX/bin/etcd-snapshot-restore.sh -o /usr/local/bin/etcd-snapshot-restore.sh
chmod 755 /usr/local/bin/etcd-snapshot-restore.sh

curl -s $PREFIX/bin/openshift-recovery-tools -o /usr/local/bin/recovery-tools

curl -s $PREFIX/bin/tokenize-signer.sh -o /usr/local/bin/tokenize-signer.sh
chmod 755 /usr/local/bin/tokenize-signer.sh
SCRIPT_EOF
  chmod +x /tmp/install_recovery_scripts.sh

  retry 10 \
    scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no \
    -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" \
    /tmp/install_recovery_scripts.sh core@"${SURVIVING_MASTER_NODE}":/tmp
  echo "Run recovery scripts installer"
  bastion_ssh core@${SURVIVING_MASTER_NODE} "sudo -i /tmp/install_recovery_scripts.sh"

  echo "Restore etcd from snapshot and initialize one node cluster"
  bastion_ssh core@${SURVIVING_MASTER_NODE} "sudo -i /usr/local/bin/etcd-snapshot-restore.sh"

  echo "Wait for API server to come up"
  retry 30 oc get nodes
Member:
Not part of the test framework that I can review, but if your etcd cluster is down to one node (because you killed two of three control-plane nodes), then how is the remaining etcd still functioning? I'd have expected it to be freaking out about having lost quorum and refusing to take possibly-split-brained actions. If there's some quick explanation for how this works, I'm very curious. If the explanation is longer, we should probably skip it to avoid distracting from the test script itself.

Contributor (author):
Running etcd-snapshot-restore.sh without params restores this etcd member as a new one-node cluster; see https://github.com/openshift/machine-config-operator/blob/master/templates/master/00-master/_base/files/usr-local-bin-openshift-recovery-tools-sh.yaml#L141-L168. Added a better comment for this in b99911295.
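For context, the core of such a single-member restore is an etcd v3 snapshot restore that rewrites the membership down to the surviving node; a rough sketch with illustrative values, not the MCO script itself:

# Illustrative only: the snapshot path, member name, IP, and data dir are placeholders.
SNAPSHOT=/var/lib/etcd-backup/snapshot.db
ETCD_NAME=surviving-master
ETCD_PEER_URL=https://10.0.0.10:2380
ETCDCTL_API=3 etcdctl snapshot restore "${SNAPSHOT}" \
  --name "${ETCD_NAME}" \
  --initial-cluster "${ETCD_NAME}=${ETCD_PEER_URL}" \
  --initial-advertise-peer-urls "${ETCD_PEER_URL}" \
  --data-dir /var/lib/etcd/restored-member
# etcd started from this data dir forms a one-node cluster that has quorum by itself;
# the remaining masters are then re-added as new members.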


  # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1707006
  echo "Restart SDN"
  retry 10 oc delete pods -l app=sdn -n openshift-sdn --wait=false

  echo "Create two masters via Machine API"
  retry 10 oc get machines -n openshift-machine-api
  # Clone existing masters, update IDs and oc apply
  SURVIVING_MASTER_MACHINE=$(oc get machine -l machine.openshift.io/cluster-api-machine-role=master -n openshift-machine-api -o name | cut -d '/' -f 2)
  SURVIVING_MASTER_NUM=${SURVIVING_MASTER_MACHINE##*-}
  SURVIVING_MASTER_PREFIX=${SURVIVING_MASTER_MACHINE%-*}
  oc get --export machine ${SURVIVING_MASTER_MACHINE} -n openshift-machine-api -o yaml > /tmp/machine.template
  SURVIVING_MASTER_AZ=$(grep -oP 'availabilityZone: (\K.+)' /tmp/machine.template)
  SURVIVING_MASTER_AZ_INDEX=$(grep -oP 'availabilityZone: .*(\K.)' /tmp/machine.template)

  MASTER_INDEX=0
  for i in $(seq 0 1); do
    if [[ "${MASTER_INDEX}" == "${SURVIVING_MASTER_NUM}" ]]; then MASTER_INDEX=$((MASTER_INDEX+1)); fi
    cat /tmp/machine.template \
      | sed 's;selfLink.*;;g' \
      | sed "s;name: ${SURVIVING_MASTER_PREFIX}-${SURVIVING_MASTER_NUM};name: ${SURVIVING_MASTER_PREFIX}-${MASTER_INDEX};" > /tmp/machine_${i}.yaml
    for j in $(seq 0 10); do oc create -n openshift-machine-api -f /tmp/machine_${i}.yaml && break; done
    MASTER_INDEX=$((MASTER_INDEX+1))
  done

echo "Waiting for machines to be created"
NEW_MASTER_IPS=()
for i in $(seq 0 60); do
NEW_MASTER_IPS=($(oc -n openshift-machine-api \
get machines \
-l machine.openshift.io/cluster-api-machine-role=master \
-o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}' || true))
if [[ "${#NEW_MASTER_IPS[@]}" == "3" ]]; then break; fi
sleep 30
done
oc get machines -n openshift-machine-api
if [[ "${#NEW_MASTER_IPS[@]}" != "3" ]]; then
echo "${NEW_MASTER_IPS[@]}"
exit 1
fi

echo "Verify new master nodes have joined the cluster"
FOUND_MASTERS=0
for i in $(seq 1 60)
do
FOUND_MASTERS=($(oc --request-timeout=5s get nodes -l node-role.kubernetes.io/master= -o name --no-headers || true))
if [[ "${#FOUND_MASTERS[@]}" == "3" ]]; then break; fi
sleep 30
done
oc get nodes
if [[ "${#FOUND_MASTERS[@]}" != "3" ]]; then
echo "${FOUND_MASTERS[@]}"
exit 1
fi

echo "Update DNS and LB"
# aws cli magic
easy_install --user pip
~/.local/bin/pip install --user boto3
cat > /tmp/update_route_53.py <<'PYTHON_EOF'
import boto3
import os
import sys

if len(sys.argv) < 3:
print("Usage: ./update_route_53.py <RECORD> <IP>")
sys.exit(1)

record = sys.argv[1]
ip = sys.argv[2]
print("record: %s" % record)
print("ip: %s" % ip)

domain = "%s.%s" % (os.environ["CLUSTER_NAME"], os.environ["BASE_DOMAIN"])

client = boto3.client('route53')
r = client.list_hosted_zones_by_name(DNSName=domain, MaxItems="1")
zone_id = r['HostedZones'][0]['Id'].split('/')[-1]

response = client.change_resource_record_sets(
HostedZoneId=zone_id,
ChangeBatch= {
'Comment': 'add %s -> %s' % (record, ip),
'Changes': [
{
'Action': 'UPSERT',
'ResourceRecordSet': {
'Name': record,
'Type': 'A',
'TTL': 60,
'ResourceRecords': [{'Value': ip}]
}
}]
})
PYTHON_EOF
for i in "${!NEW_MASTER_IPS[@]}"; do
ETCD_NAME="etcd-${i}.${CLUSTER_NAME}.${BASE_DOMAIN}"
python /tmp/update_route_53.py "${ETCD_NAME}" "${NEW_MASTER_IPS[$i]}"
done

echo "Run etcd-signer"
SURVIVING_MASTER_NODE_SHORT=${SURVIVING_MASTER_NODE%%.*}
curl -O https://raw.githubusercontent.com/hexfusion/openshift-recovery/master/manifests/kube-etcd-cert-signer.yaml.template
sed "s;__MASTER_HOSTNAME__;${SURVIVING_MASTER_NODE_SHORT};g" kube-etcd-cert-signer.yaml.template > kube-etcd-cert-signer.yaml
retry 10 oc create -f kube-etcd-cert-signer.yaml
retry 10 oc get pod/etcd-signer -n openshift-config -o name
retry 10 oc wait pod/etcd-signer -n openshift-config --for condition=ready

echo "Grow etcd cluster to full membership"
SURVIVING_MASTER_IP=$(oc get nodes ${SURVIVING_MASTER_NODE} -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')
SETUP_ETCD_ENVIRONMENT=$(oc adm release info --image-for setup-etcd-environment)
KUBE_CLIENT_AGENT=$(oc adm release info --image-for kube-client-agent)
MASTERS=($(oc -n openshift-machine-api \
get machines \
-l machine.openshift.io/cluster-api-machine-role=master \
-o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalDNS")].address}{"\n"}{end}'))
for master in ${MASTERS[@]}
do
if [[ "${master}" == ${SURVIVING_MASTER_NODE} ]]; then continue; fi
retry 10 \
scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no \
-o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" \
/tmp/install_recovery_scripts.sh core@${master}:/tmp
bastion_ssh core@${master} "sudo -i /tmp/install_recovery_scripts.sh"
bastion_ssh core@${master} "sudo -i env SETUP_ETCD_ENVIRONMENT=${SETUP_ETCD_ENVIRONMENT} KUBE_CLIENT_AGENT=${KUBE_CLIENT_AGENT} /usr/local/bin/etcd-member-recover.sh ${SURVIVING_MASTER_IP}"
done

for master in ${MASTERS[@]}
do
retry 10 oc get pod/etcd-member-${master} -n openshift-etcd -o name
retry 10 oc wait pod/etcd-member-${master} -n openshift-etcd --for condition=Ready
done

echo "Removing ssh-bastion"
retry 10 oc delete project openshift-ssh-bastion
retry 10 oc wait namespace/openshift-ssh-bastion --for delete --timeout=10m

echo "Scale etcd-quorum guard"
retry 10 oc scale --replicas=3 deployment.apps/etcd-quorum-guard -n openshift-machine-config-operator

echo "Remove etcd-signer"
retry 10 oc delete pod/etcd-signer -n openshift-config
}

function run-upgrade-tests() {
  openshift-tests run-upgrade "${TEST_SUITE}" --to-image "${RELEASE_IMAGE_LATEST}" \
    --provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit