installer template: add 'recover from etcd quorum loss' test #3572
@@ -103,6 +103,10 @@ objects:
  value: /tmp/home
- name: KUBECONFIG
  value: /tmp/artifacts/installer/auth/kubeconfig
- name: CLUSTER_NAME
  value: ${NAMESPACE}-${JOB_NAME_HASH}
- name: BASE_DOMAIN
  value: ${BASE_DOMAIN}
command:
- /bin/bash
- -c

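The two new CLUSTER_NAME and BASE_DOMAIN variables exist so the recovery step can address the cluster's etcd DNS records. A minimal sketch of how they compose, with made-up values rather than anything taken from this template:

    CLUSTER_NAME="ci-op-abc123-1a2b"                # ${NAMESPACE}-${JOB_NAME_HASH} in the template
    BASE_DOMAIN="example.devcluster.openshift.com"  # hypothetical base domain
    # The Route 53 helper added further down upserts A records of this shape:
    echo "etcd-0.${CLUSTER_NAME}.${BASE_DOMAIN}"
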
@@ -159,6 +163,43 @@ objects:
mkdir -p /tmp/output
cd /tmp/output

function retry() {
  local ATTEMPTS="${1}"
  shift
  echo "${@}"
  rc=1
  for i in $(seq 0 ${ATTEMPTS}); do
    "${@}" && rc=0 && break
    sleep 10
  done
  if [ "${rc}" != "0" ]; then exit 1; fi
}

function setup_ssh_bastion() {
  echo "Setting up ssh bastion"
  mkdir -p ~/.ssh || true
  cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
  chmod 0600 ~/.ssh/id_rsa
  if ! whoami &> /dev/null; then
    if [ -w /etc/passwd ]; then
      echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd
    fi
  fi
  curl https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/deploy.sh | bash
  for i in $(seq 0 60)
  do
    BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
    if [ ! -z "${BASTION_HOST}" ]; then break; fi
    sleep 10
  done
}

function bastion_ssh() {
  retry 60 \
    ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
      -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=5 -W %h:%p core@${BASTION_HOST}" \
      $@
}

function restore-cluster-state() {
  echo "Placing file /etc/rollback-test with contents A"
  cat > /tmp/machineconfig.yaml <<'EOF'

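A brief sketch of how the new helpers compose; the node name and command below are invented for illustration only:

    setup_ssh_bastion                      # deploys the bastion and waits for its load balancer hostname
    retry 10 oc get nodes                  # re-run a flaky API call with 10-second pauses
    bastion_ssh core@ip-10-0-140-12.ec2.internal hostname   # reach a private node through the bastion's ProxyCommand
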
@@ -189,21 +230,7 @@ objects:

wait_for_machineconfigpool_to_apply

echo "Setting up ssh bastion"
mkdir -p ~/.ssh || true
cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
echo "${USER:-default}:x:$(id -u):$(id -g):Default User:$HOME:/sbin/nologin" >> /etc/passwd
curl https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/deploy.sh | bash
BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')

function bastion_ssh() {
  while true
  do
    ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=5 -W %h:%p core@${BASTION_HOST}" $@ && break
    sleep 10
  done
}
setup_ssh_bastion

echo "Make etcd backup on first master"
FIRST_MASTER=$(oc get node -l node-role.kubernetes.io/master= -o name | head -n1 | cut -d '/' -f 2)

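For context, "oc get node -o name" prints node/<hostname>, so the cut -d '/' -f 2 above strips the resource prefix; a hedged example with a made-up master node:

    $ oc get node -l node-role.kubernetes.io/master= -o name | head -n1
    node/ip-10-0-131-58.ec2.internal
    $ oc get node -l node-role.kubernetes.io/master= -o name | head -n1 | cut -d '/' -f 2
    ip-10-0-131-58.ec2.internal
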
@@ -342,6 +369,222 @@ objects:
  oc wait namespace/openshift-ssh-bastion --for delete --timeout=10m
}

function recover-from-etcd-quorum-loss() {
  mkdir -p ~/.ssh || true
  cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
  chmod 0600 ~/.ssh/id_rsa
  if ! whoami &> /dev/null; then
    if [ -w /etc/passwd ]; then
      echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd
    fi
  fi

  setup_ssh_bastion

  # Machine API won't let the user destroy the node which runs the controller
  echo "Finding two masters to destroy"
  MAPI_POD=$(oc get pod -l k8s-app=controller -n openshift-machine-api --no-headers -o name)
  SURVIVING_MASTER_NODE=$(oc get ${MAPI_POD} -n openshift-machine-api -o jsonpath='{.spec.nodeName}')
  mapfile -t MASTER_NODES_TO_REMOVE < <(oc get nodes -l node-role.kubernetes.io/master= -o name | grep -v "${SURVIVING_MASTER_NODE}")
  MASTER_MACHINES_TO_REMOVE=()
  for master in ${MASTER_NODES_TO_REMOVE[@]}
  do
    MASTER_MACHINES_TO_REMOVE+=($(oc get ${master} -o jsonpath='{.metadata.annotations.machine\.openshift\.io\/machine}' | cut -d '/' -f 2))
  done

  echo "Destroy two masters"
  # Scale down etcd quorum guard
  oc scale --replicas=0 deployment.apps/etcd-quorum-guard -n openshift-machine-config-operator

  for machine in ${MASTER_MACHINES_TO_REMOVE[@]}
  do
    retry 10 oc --request-timeout=5s -n openshift-machine-api delete machine ${machine}
  done

| echo "Confirm meltdown" | ||
| sleep 30 | ||
|
||
| oc --request-timeout=5s get nodes && exit 1 | ||
|
|
||
| echo "Upload a script to restore single node cluster" | ||
| cat > /tmp/install_recovery_scripts.sh <<'SCRIPT_EOF' | ||
| #!/bin/bash | ||
| set -x | ||
| if [[ $EUID -ne 0 ]]; then | ||
| echo "This script must be run as root" | ||
| exit 1 | ||
| fi | ||
|
|
||
| PREFIX="https://raw.githubusercontent.com/vrutkovs/openshift-recovery/unify-fixes" | ||
|
||
|
|
||
| curl -s $PREFIX/bin/etcd-member-recover.sh -o /usr/local/bin/etcd-member-recover.sh | ||
| chmod 755 /usr/local/bin/etcd-member-recover.sh | ||
|
|
||
| curl -s $PREFIX/bin/etcd-snapshot-restore.sh -o /usr/local/bin/etcd-snapshot-restore.sh | ||
| chmod 755 /usr/local/bin/etcd-snapshot-restore.sh | ||
|
|
||
| curl -s $PREFIX/bin/openshift-recovery-tools -o /usr/local/bin/recovery-tools | ||
|
|
||
| curl -s $PREFIX/bin/tokenize-signer.sh -o /usr/local/bin/tokenize-signer.sh | ||
| chmod 755 /usr/local/bin/tokenize-signer.sh | ||
| SCRIPT_EOF | ||
| chmod +x /tmp/install_recovery_scripts.sh | ||
|
|
||
  retry 10 \
    scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no \
      -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" \
    /tmp/install_recovery_scripts.sh core@"${SURVIVING_MASTER_NODE}":/tmp
  echo "Run recovery scripts installer"
  bastion_ssh core@${SURVIVING_MASTER_NODE} "sudo -i /tmp/install_recovery_scripts.sh"

  echo "Restore etcd from snapshot and initialize one node cluster"
  bastion_ssh core@${SURVIVING_MASTER_NODE} "sudo -i /usr/local/bin/etcd-snapshot-restore.sh"

  echo "Wait for API server to come up"
  retry 30 oc get nodes

  # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1707006
  echo "Restart SDN"
  retry 10 oc delete pods -l app=sdn -n openshift-sdn --wait=false

| echo "Create two masters via Machine API" | ||
| retry 10 oc get machines -n openshift-machine-api | ||
| # Clone existing masters, update IDs and oc apply | ||
| SURVIVING_MASTER_MACHINE=$(oc get machine -l machine.openshift.io/cluster-api-machine-role=master -n openshift-machine-api -o name | cut -d '/' -f 2) | ||
| SURVIVING_MASTER_NUM=${SURVIVING_MASTER_MACHINE##*-} | ||
| SURVIVING_MASTER_PREFIX=${SURVIVING_MASTER_MACHINE%-*} | ||
| oc get --export machine ${SURVIVING_MASTER_MACHINE} -n openshift-machine-api -o yaml > /tmp/machine.template | ||
| SURVIVING_MASTER_AZ=$(grep -oP 'availabilityZone: (\K.+)' /tmp/machine.template) | ||
| SURVIVING_MASTER_AZ_INDEX=$(grep -oP 'availabilityZone: .*(\K.)' /tmp/machine.template) | ||
|
|
||
| MASTER_INDEX=0 | ||
| for i in $(seq 0 1); do | ||
| if [[ "${MASTER_INDEX}" == "${SURVIVING_MASTER_NUM}" ]]; then MASTER_INDEX=$((MASTER_INDEX+1)); fi | ||
| cat /tmp/machine.template \ | ||
| | sed 's;selfLink.*;;g' \ | ||
| | sed "s;name: ${SURVIVING_MASTER_PREFIX}-${SURVIVING_MASTER_NUM};name: ${SURVIVING_MASTER_PREFIX}-${MASTER_INDEX};" > /tmp/machine_${i}.yaml | ||
| for j in $(seq 0 10); do oc create -n openshift-machine-api -f /tmp/machine_${i}.yaml && break; done | ||
| MASTER_INDEX=$((MASTER_INDEX+1)) | ||
| done | ||
|
|
||
| echo "Waiting for machines to be created" | ||
| NEW_MASTER_IPS=() | ||
| for i in $(seq 0 60); do | ||
| NEW_MASTER_IPS=($(oc -n openshift-machine-api \ | ||
| get machines \ | ||
| -l machine.openshift.io/cluster-api-machine-role=master \ | ||
| -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}' || true)) | ||
| if [[ "${#NEW_MASTER_IPS[@]}" == "3" ]]; then break; fi | ||
| sleep 30 | ||
| done | ||
| oc get machines -n openshift-machine-api | ||
| if [[ "${#NEW_MASTER_IPS[@]}" != "3" ]]; then | ||
| echo "${NEW_MASTER_IPS[@]}" | ||
| exit 1 | ||
| fi | ||
|
|
||
| echo "Verify new master nodes have joined the cluster" | ||
| FOUND_MASTERS=0 | ||
| for i in $(seq 1 60) | ||
| do | ||
| FOUND_MASTERS=($(oc --request-timeout=5s get nodes -l node-role.kubernetes.io/master= -o name --no-headers || true)) | ||
| if [[ "${#FOUND_MASTERS[@]}" == "3" ]]; then break; fi | ||
| sleep 30 | ||
| done | ||
| oc get nodes | ||
| if [[ "${#FOUND_MASTERS[@]}" != "3" ]]; then | ||
| echo "${FOUND_MASTERS[@]}" | ||
| exit 1 | ||
| fi | ||
|
|
||
| echo "Update DNS and LB" | ||
| # aws cli magic | ||
| easy_install --user pip | ||
| ~/.local/bin/pip install --user boto3 | ||
| cat > /tmp/update_route_53.py <<'PYTHON_EOF' | ||
| import boto3 | ||
| import os | ||
| import sys | ||
|
|
||
| if len(sys.argv) < 3: | ||
| print("Usage: ./update_route_53.py <RECORD> <IP>") | ||
| sys.exit(1) | ||
|
|
||
| record = sys.argv[1] | ||
| ip = sys.argv[2] | ||
| print("record: %s" % record) | ||
| print("ip: %s" % ip) | ||
|
|
||
| domain = "%s.%s" % (os.environ["CLUSTER_NAME"], os.environ["BASE_DOMAIN"]) | ||
|
|
||
| client = boto3.client('route53') | ||
| r = client.list_hosted_zones_by_name(DNSName=domain, MaxItems="1") | ||
| zone_id = r['HostedZones'][0]['Id'].split('/')[-1] | ||
|
|
||
| response = client.change_resource_record_sets( | ||
| HostedZoneId=zone_id, | ||
| ChangeBatch= { | ||
| 'Comment': 'add %s -> %s' % (record, ip), | ||
| 'Changes': [ | ||
| { | ||
| 'Action': 'UPSERT', | ||
| 'ResourceRecordSet': { | ||
| 'Name': record, | ||
| 'Type': 'A', | ||
| 'TTL': 60, | ||
| 'ResourceRecords': [{'Value': ip}] | ||
| } | ||
| }] | ||
| }) | ||
| PYTHON_EOF | ||
| for i in "${!NEW_MASTER_IPS[@]}"; do | ||
| ETCD_NAME="etcd-${i}.${CLUSTER_NAME}.${BASE_DOMAIN}" | ||
| python /tmp/update_route_53.py "${ETCD_NAME}" "${NEW_MASTER_IPS[$i]}" | ||
| done | ||
|
|
||
| echo "Run etcd-signer" | ||
| SURVIVING_MASTER_NODE_SHORT=${SURVIVING_MASTER_NODE%%.*} | ||
| curl -O https://raw.githubusercontent.com/hexfusion/openshift-recovery/master/manifests/kube-etcd-cert-signer.yaml.template | ||
| sed "s;__MASTER_HOSTNAME__;${SURVIVING_MASTER_NODE_SHORT};g" kube-etcd-cert-signer.yaml.template > kube-etcd-cert-signer.yaml | ||
  retry 10 oc create -f kube-etcd-cert-signer.yaml
  retry 10 oc get pod/etcd-signer -n openshift-config -o name
  retry 10 oc wait pod/etcd-signer -n openshift-config --for condition=ready

| echo "Grow etcd cluster to full membership" | ||
| SURVIVING_MASTER_IP=$(oc get nodes ${SURVIVING_MASTER_NODE} -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}') | ||
| SETUP_ETCD_ENVIRONMENT=$(oc adm release info --image-for setup-etcd-environment) | ||
| KUBE_CLIENT_AGENT=$(oc adm release info --image-for kube-client-agent) | ||
| MASTERS=($(oc -n openshift-machine-api \ | ||
| get machines \ | ||
| -l machine.openshift.io/cluster-api-machine-role=master \ | ||
| -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalDNS")].address}{"\n"}{end}')) | ||
| for master in ${MASTERS[@]} | ||
| do | ||
| if [[ "${master}" == ${SURVIVING_MASTER_NODE} ]]; then continue; fi | ||
| retry 10 \ | ||
| scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no \ | ||
| -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" \ | ||
| /tmp/install_recovery_scripts.sh core@${master}:/tmp | ||
| bastion_ssh core@${master} "sudo -i /tmp/install_recovery_scripts.sh" | ||
| bastion_ssh core@${master} "sudo -i env SETUP_ETCD_ENVIRONMENT=${SETUP_ETCD_ENVIRONMENT} KUBE_CLIENT_AGENT=${KUBE_CLIENT_AGENT} /usr/local/bin/etcd-member-recover.sh ${SURVIVING_MASTER_IP}" | ||
| done | ||
|
|
||
| for master in ${MASTERS[@]} | ||
| do | ||
| retry 10 oc get pod/etcd-member-${master} -n openshift-etcd -o name | ||
| retry 10 oc wait pod/etcd-member-${master} -n openshift-etcd --for condition=Ready | ||
| done | ||
|
|
||
| echo "Removing ssh-bastion" | ||
| retry 10 oc delete project openshift-ssh-bastion | ||
| retry 10 oc wait namespace/openshift-ssh-bastion --for delete --timeout=10m | ||
|
|
||
| echo "Scale etcd-quorum guard" | ||
| retry 10 oc scale --replicas=3 deployment.apps/etcd-quorum-guard -n openshift-machine-config-operator | ||
|
|
||
| echo "Remove etcd-signer" | ||
| retry 10 oc delete pod/etcd-signer -n openshift-config | ||
| } | ||
|
|
||
| function run-upgrade-tests() { | ||
| openshift-tests run-upgrade "${TEST_SUITE}" --to-image "${RELEASE_IMAGE_LATEST}" \ | ||
| --provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit | ||
|
|
||
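For reference, a standalone invocation of the Route 53 helper added in this PR might look like the sketch below; the values are invented, and it assumes boto3 plus AWS credentials are already available in the test pod:

    export CLUSTER_NAME="ci-op-abc123-1a2b"
    export BASE_DOMAIN="example.devcluster.openshift.com"
    # Upsert an A record pointing a recreated master's etcd name at its new internal IP.
    python /tmp/update_route_53.py "etcd-1.${CLUSTER_NAME}.${BASE_DOMAIN}" 10.0.142.17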