diff --git a/ci-operator/config/openshift/installer/openshift-installer-master.yaml b/ci-operator/config/openshift/installer/openshift-installer-master.yaml
index 09b5b3590de69..2d038ada68c86 100644
--- a/ci-operator/config/openshift/installer/openshift-installer-master.yaml
+++ b/ci-operator/config/openshift/installer/openshift-installer-master.yaml
@@ -141,13 +141,20 @@ tests:
   commands: TEST_SUITE=openshift/conformance run-tests
   openshift_ansible_40:
     cluster_profile: aws-centos-40
-- as: e2e-vsphere
-  commands: TEST_SUITE=openshift/conformance/parallel run-tests
-  openshift_installer_upi:
-    cluster_profile: vsphere
+- as: e2e-etcd-quorum-loss
+  commands: |
+    recover-from-etcd-quorum-loss
+    TEST_SUITE=openshift/conformance/parallel run-tests
+  openshift_installer:
+    cluster_profile: aws
+    upgrade: false
 - as: e2e-restore-cluster-state
   commands: |
     restore-cluster-state
     TEST_SUITE=openshift/conformance/parallel run-tests
   openshift_installer:
     cluster_profile: aws
+- as: e2e-vsphere
+  commands: TEST_SUITE=openshift/conformance/parallel run-tests
+  openshift_installer_upi:
+    cluster_profile: vsphere
diff --git a/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml b/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml
index 33a6fad7e4f49..0bd374f30e047 100644
--- a/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml
+++ b/ci-operator/jobs/openshift/installer/openshift-installer-master-presubmits.yaml
@@ -348,6 +348,73 @@ presubmits:
         secret:
           secretName: sentry-dsn
     trigger: '(?m)^/test (?:.*? )?e2e-aws-upi(?: .*?)?$'
+  - agent: kubernetes
+    always_run: false
+    branches:
+    - master
+    context: ci/prow/e2e-etcd-quorum-loss
+    decorate: true
+    decoration_config:
+      skip_cloning: true
+    labels:
+      ci-operator.openshift.io/prowgen-controlled: "true"
+    name: pull-ci-openshift-installer-master-e2e-etcd-quorum-loss
+    optional: true
+    rerun_command: /test e2e-etcd-quorum-loss
+    spec:
+      containers:
+      - args:
+        - --artifact-dir=$(ARTIFACTS)
+        - --give-pr-author-access-to-namespace=true
+        - --secret-dir=/usr/local/e2e-etcd-quorum-loss-cluster-profile
+        - --sentry-dsn-path=/etc/sentry-dsn/ci-operator
+        - --target=e2e-etcd-quorum-loss
+        - --template=/usr/local/e2e-etcd-quorum-loss
+        command:
+        - ci-operator
+        env:
+        - name: CLUSTER_TYPE
+          value: aws
+        - name: CONFIG_SPEC
+          valueFrom:
+            configMapKeyRef:
+              key: openshift-installer-master.yaml
+              name: ci-operator-master-configs
+        - name: JOB_NAME_SAFE
+          value: e2e-etcd-quorum-loss
+        - name: TEST_COMMAND
+          value: |
+            recover-from-etcd-quorum-loss
+            TEST_SUITE=openshift/conformance/parallel run-tests
+        image: ci-operator:latest
+        imagePullPolicy: Always
+        name: ""
+        resources:
+          requests:
+            cpu: 10m
+        volumeMounts:
+        - mountPath: /usr/local/e2e-etcd-quorum-loss-cluster-profile
+          name: cluster-profile
+        - mountPath: /usr/local/e2e-etcd-quorum-loss
+          name: job-definition
+          subPath: cluster-launch-installer-e2e.yaml
+        - mountPath: /etc/sentry-dsn
+          name: sentry-dsn
+          readOnly: true
+      serviceAccountName: ci-operator
+      volumes:
+      - name: cluster-profile
+        projected:
+          sources:
+          - secret:
+              name: cluster-secrets-aws
+      - configMap:
+          name: prow-job-cluster-launch-installer-e2e
+        name: job-definition
+      - name: sentry-dsn
+        secret:
+          secretName: sentry-dsn
+    trigger: '(?m)^/test (?:.*? )?e2e-etcd-quorum-loss(?: .*?)?$'
   - agent: kubernetes
     always_run: false
     branches:
diff --git a/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml b/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml
index e24774c1911b4..8715b6e38352a 100644
--- a/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml
+++ b/ci-operator/templates/openshift/installer/cluster-launch-installer-e2e.yaml
@@ -103,6 +103,10 @@ objects:
         value: /tmp/home
       - name: KUBECONFIG
         value: /tmp/artifacts/installer/auth/kubeconfig
+      - name: CLUSTER_NAME
+        value: ${NAMESPACE}-${JOB_NAME_HASH}
+      - name: BASE_DOMAIN
+        value: ${BASE_DOMAIN}
     command:
     - /bin/bash
     - -c
@@ -159,6 +163,43 @@ objects:
       mkdir -p /tmp/output
       cd /tmp/output
+      function retry() {
+        local ATTEMPTS="${1}"
+        shift
+        echo "${@}"
+        rc=1
+        for i in $(seq 0 ${ATTEMPTS}); do
+          "${@}" && rc=0 && break
+          sleep 10
+        done
+        if [ "${rc}" != "0" ]; then exit 1; fi
+      }
+
+      function setup_ssh_bastion() {
+        echo "Setting up ssh bastion"
+        mkdir -p ~/.ssh || true
+        cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
+        chmod 0600 ~/.ssh/id_rsa
+        if ! whoami &> /dev/null; then
+          if [ -w /etc/passwd ]; then
+            echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd
+          fi
+        fi
+        curl https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/deploy.sh | bash
+        for i in $(seq 0 60)
+        do
+          BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
+          if [ ! -z "${BASTION_HOST}" ]; then break; fi
+          sleep 10
+        done
+      }
+
+      function bastion_ssh() {
+        retry 60 \
+          ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=5 -o StrictHostKeyChecking=no \
+            -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=5 -W %h:%p core@${BASTION_HOST}" \
+            $@
+      }
 
       function restore-cluster-state() {
         echo "Placing file /etc/rollback-test with contents A"
         cat > /tmp/machineconfig.yaml <<'EOF'
@@ -189,21 +230,7 @@ objects:
 
       wait_for_machineconfigpool_to_apply
 
-        echo "Setting up ssh bastion"
-        mkdir -p ~/.ssh || true
-        cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
-        chmod 0600 ~/.ssh/id_rsa
-        echo "${USER:-default}:x:$(id -u):$(id -g):Default User:$HOME:/sbin/nologin" >> /etc/passwd
-        curl https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/deploy.sh | bash
-        BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
-
-        function bastion_ssh() {
-          while true
-          do
-            ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=5 -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=5 -W %h:%p core@${BASTION_HOST}" $@ && break
-            sleep 10
-          done
-        }
+        setup_ssh_bastion
 
         echo "Make etcd backup on first master"
         FIRST_MASTER=$(oc get node -l node-role.kubernetes.io/master= -o name | head -n1 | cut -d '/' -f 2)
@@ -342,6 +369,222 @@ objects:
         oc wait namespace/openshift-ssh-bastion --for delete --timeout=10m
       }
 
+      function recover-from-etcd-quorum-loss() {
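+        # Overview of the drill: deliberately destroy two of the three masters,
+        # restore etcd from a snapshot on the surviving master, recreate the lost
+        # masters through the Machine API, repoint the etcd DNS records, and grow
+        # the etcd cluster back to three members.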
+        mkdir -p ~/.ssh || true
+        cp "${KUBE_SSH_KEY_PATH}" ~/.ssh/id_rsa
+        chmod 0600 ~/.ssh/id_rsa
+        if ! whoami &> /dev/null; then
+          if [ -w /etc/passwd ]; then
+            echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd
+          fi
+        fi
+
+        setup_ssh_bastion
+
+        # The Machine API won't let the user destroy the node that runs its controller
+        echo "Finding two masters to destroy"
+        MAPI_POD=$(oc get pod -l k8s-app=controller -n openshift-machine-api --no-headers -o name)
+        SURVIVING_MASTER_NODE=$(oc get ${MAPI_POD} -n openshift-machine-api -o jsonpath='{.spec.nodeName}')
+        mapfile -t MASTER_NODES_TO_REMOVE < <(oc get nodes -l node-role.kubernetes.io/master= -o name | grep -v "${SURVIVING_MASTER_NODE}")
+        MASTER_MACHINES_TO_REMOVE=()
+        for master in ${MASTER_NODES_TO_REMOVE[@]}
+        do
+          MASTER_MACHINES_TO_REMOVE+=($(oc get ${master} -o jsonpath='{.metadata.annotations.machine\.openshift\.io\/machine}' | cut -d '/' -f 2))
+        done
+
+        echo "Destroy two masters"
+        # Scale down the etcd-quorum-guard so the master machines can be deleted
+        oc scale --replicas=0 deployment.apps/etcd-quorum-guard -n openshift-machine-config-operator
+
+        for machine in ${MASTER_MACHINES_TO_REMOVE[@]}
+        do
+          retry 10 oc --request-timeout=5s -n openshift-machine-api delete machine ${machine}
+        done
+
+        echo "Confirm meltdown"
+        sleep 30
+        # The API server must be unreachable by now; fail the test if it still responds
+        oc --request-timeout=5s get nodes && exit 1
+
+        echo "Upload a script to restore a single-node cluster"
+        cat > /tmp/install_recovery_scripts.sh <<'SCRIPT_EOF'
+      #!/bin/bash
+      set -x
+      if [[ $EUID -ne 0 ]]; then
+        echo "This script must be run as root"
+        exit 1
+      fi
+
+      PREFIX="https://raw.githubusercontent.com/vrutkovs/openshift-recovery/unify-fixes"
+
+      curl -s $PREFIX/bin/etcd-member-recover.sh -o /usr/local/bin/etcd-member-recover.sh
+      chmod 755 /usr/local/bin/etcd-member-recover.sh
+
+      curl -s $PREFIX/bin/etcd-snapshot-restore.sh -o /usr/local/bin/etcd-snapshot-restore.sh
+      chmod 755 /usr/local/bin/etcd-snapshot-restore.sh
+
+      curl -s $PREFIX/bin/openshift-recovery-tools -o /usr/local/bin/recovery-tools
+
+      curl -s $PREFIX/bin/tokenize-signer.sh -o /usr/local/bin/tokenize-signer.sh
+      chmod 755 /usr/local/bin/tokenize-signer.sh
+      SCRIPT_EOF
+        chmod +x /tmp/install_recovery_scripts.sh
+
+        retry 10 \
+          scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no \
+          -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" \
+          /tmp/install_recovery_scripts.sh core@"${SURVIVING_MASTER_NODE}":/tmp
+        echo "Run recovery scripts installer"
+        bastion_ssh core@${SURVIVING_MASTER_NODE} "sudo -i /tmp/install_recovery_scripts.sh"
+
+        echo "Restore etcd from snapshot and initialize one node cluster"
+        bastion_ssh core@${SURVIVING_MASTER_NODE} "sudo -i /usr/local/bin/etcd-snapshot-restore.sh"
+
+        echo "Wait for API server to come up"
+        retry 30 oc get nodes
+
+        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1707006
+        echo "Restart SDN"
+        retry 10 oc delete pods -l app=sdn -n openshift-sdn --wait=false
+
+        echo "Create two masters via Machine API"
+        retry 10 oc get machines -n openshift-machine-api
+        # Clone the surviving master's Machine object, update its name, and oc create the clones
+        SURVIVING_MASTER_MACHINE=$(oc get machine -l machine.openshift.io/cluster-api-machine-role=master -n openshift-machine-api -o name | cut -d '/' -f 2)
+        SURVIVING_MASTER_NUM=${SURVIVING_MASTER_MACHINE##*-}
+        SURVIVING_MASTER_PREFIX=${SURVIVING_MASTER_MACHINE%-*}
+        oc get --export machine ${SURVIVING_MASTER_MACHINE} -n openshift-machine-api -o yaml > /tmp/machine.template
+        SURVIVING_MASTER_AZ=$(grep -oP 'availabilityZone: (\K.+)' /tmp/machine.template)
+        SURVIVING_MASTER_AZ_INDEX=$(grep -oP 'availabilityZone: .*(\K.)' /tmp/machine.template)
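+
+        # Master Machine objects are named <prefix>-<index>; the loop below creates
+        # clones for the two indices not taken by the surviving master.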
+        MASTER_INDEX=0
+        for i in $(seq 0 1); do
+          if [[ "${MASTER_INDEX}" == "${SURVIVING_MASTER_NUM}" ]]; then MASTER_INDEX=$((MASTER_INDEX+1)); fi
+          cat /tmp/machine.template \
+            | sed 's;selfLink.*;;g' \
+            | sed "s;name: ${SURVIVING_MASTER_PREFIX}-${SURVIVING_MASTER_NUM};name: ${SURVIVING_MASTER_PREFIX}-${MASTER_INDEX};" > /tmp/machine_${i}.yaml
+          for j in $(seq 0 10); do oc create -n openshift-machine-api -f /tmp/machine_${i}.yaml && break; done
+          MASTER_INDEX=$((MASTER_INDEX+1))
+        done
+
+        echo "Waiting for machines to be created"
+        NEW_MASTER_IPS=()
+        for i in $(seq 0 60); do
+          NEW_MASTER_IPS=($(oc -n openshift-machine-api \
+            get machines \
+            -l machine.openshift.io/cluster-api-machine-role=master \
+            -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}' || true))
+          if [[ "${#NEW_MASTER_IPS[@]}" == "3" ]]; then break; fi
+          sleep 30
+        done
+        oc get machines -n openshift-machine-api
+        if [[ "${#NEW_MASTER_IPS[@]}" != "3" ]]; then
+          echo "${NEW_MASTER_IPS[@]}"
+          exit 1
+        fi
+
+        echo "Verify new master nodes have joined the cluster"
+        FOUND_MASTERS=()
+        for i in $(seq 1 60)
+        do
+          FOUND_MASTERS=($(oc --request-timeout=5s get nodes -l node-role.kubernetes.io/master= -o name --no-headers || true))
+          if [[ "${#FOUND_MASTERS[@]}" == "3" ]]; then break; fi
+          sleep 30
+        done
+        oc get nodes
+        if [[ "${#FOUND_MASTERS[@]}" != "3" ]]; then
+          echo "${FOUND_MASTERS[@]}"
+          exit 1
+        fi
+
+        echo "Update DNS and LB"
+        # Install boto3 so the script below can talk to Route 53 directly
+        easy_install --user pip
+        ~/.local/bin/pip install --user boto3
+        cat > /tmp/update_route_53.py <<'PYTHON_EOF'
+      import boto3
+      import os
+      import sys
+
+      if len(sys.argv) < 3:
+          print("Usage: ./update_route_53.py <record> <ip>")
+          sys.exit(1)
+
+      record = sys.argv[1]
+      ip = sys.argv[2]
+      print("record: %s" % record)
+      print("ip: %s" % ip)
+
+      domain = "%s.%s" % (os.environ["CLUSTER_NAME"], os.environ["BASE_DOMAIN"])
+
+      client = boto3.client('route53')
+      r = client.list_hosted_zones_by_name(DNSName=domain, MaxItems="1")
+      zone_id = r['HostedZones'][0]['Id'].split('/')[-1]
+
+      response = client.change_resource_record_sets(
+          HostedZoneId=zone_id,
+          ChangeBatch={
+              'Comment': 'add %s -> %s' % (record, ip),
+              'Changes': [
+                  {
+                      'Action': 'UPSERT',
+                      'ResourceRecordSet': {
+                          'Name': record,
+                          'Type': 'A',
+                          'TTL': 60,
+                          'ResourceRecords': [{'Value': ip}]
+                      }
+                  }]
+          })
+      PYTHON_EOF
+        for i in "${!NEW_MASTER_IPS[@]}"; do
+          ETCD_NAME="etcd-${i}.${CLUSTER_NAME}.${BASE_DOMAIN}"
+          python /tmp/update_route_53.py "${ETCD_NAME}" "${NEW_MASTER_IPS[$i]}"
+        done
+
+        echo "Run etcd-signer"
+        SURVIVING_MASTER_NODE_SHORT=${SURVIVING_MASTER_NODE%%.*}
+        curl -O https://raw.githubusercontent.com/hexfusion/openshift-recovery/master/manifests/kube-etcd-cert-signer.yaml.template
+        sed "s;__MASTER_HOSTNAME__;${SURVIVING_MASTER_NODE_SHORT};g" kube-etcd-cert-signer.yaml.template > kube-etcd-cert-signer.yaml
+        retry 10 oc create -f kube-etcd-cert-signer.yaml
+        retry 10 oc get pod/etcd-signer -n openshift-config -o name
+        retry 10 oc wait pod/etcd-signer -n openshift-config --for condition=ready
+
+        echo "Grow etcd cluster to full membership"
+        SURVIVING_MASTER_IP=$(oc get nodes ${SURVIVING_MASTER_NODE} -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')
+        SETUP_ETCD_ENVIRONMENT=$(oc adm release info --image-for setup-etcd-environment)
+        KUBE_CLIENT_AGENT=$(oc adm release info --image-for kube-client-agent)
+        MASTERS=($(oc -n openshift-machine-api \
+          get machines \
+          -l machine.openshift.io/cluster-api-machine-role=master \
+          -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalDNS")].address}{"\n"}{end}'))
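+        # Copy the recovery scripts to each recreated master over the bastion and
+        # run etcd-member-recover.sh against the surviving member's IP so the new
+        # members rejoin the restored etcd cluster.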
+        for master in ${MASTERS[@]}
+        do
+          if [[ "${master}" == ${SURVIVING_MASTER_NODE} ]]; then continue; fi
+          retry 10 \
+            scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no \
+            -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" \
+            /tmp/install_recovery_scripts.sh core@${master}:/tmp
+          bastion_ssh core@${master} "sudo -i /tmp/install_recovery_scripts.sh"
+          bastion_ssh core@${master} "sudo -i env SETUP_ETCD_ENVIRONMENT=${SETUP_ETCD_ENVIRONMENT} KUBE_CLIENT_AGENT=${KUBE_CLIENT_AGENT} /usr/local/bin/etcd-member-recover.sh ${SURVIVING_MASTER_IP}"
+        done
+
+        for master in ${MASTERS[@]}
+        do
+          retry 10 oc get pod/etcd-member-${master} -n openshift-etcd -o name
+          retry 10 oc wait pod/etcd-member-${master} -n openshift-etcd --for condition=Ready
+        done
+
+        echo "Removing ssh-bastion"
+        retry 10 oc delete project openshift-ssh-bastion
+        retry 10 oc wait namespace/openshift-ssh-bastion --for delete --timeout=10m
+
+        echo "Scale etcd-quorum-guard back up"
+        retry 10 oc scale --replicas=3 deployment.apps/etcd-quorum-guard -n openshift-machine-config-operator
+
+        echo "Remove etcd-signer"
+        retry 10 oc delete pod/etcd-signer -n openshift-config
+      }
+
       function run-upgrade-tests() {
         openshift-tests run-upgrade "${TEST_SUITE}" --to-image "${RELEASE_IMAGE_LATEST}" \
           --provider "${TEST_PROVIDER:-}" -o /tmp/artifacts/e2e.log --junit-dir /tmp/artifacts/junit
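For reference, the new presubmit is opt-in (always_run: false, optional: true); it is started from a pull request with "/test e2e-etcd-quorum-loss", and per the generated TEST_COMMAND the test container runs recover-from-etcd-quorum-loss before the parallel conformance suite. One behavior of the shared retry helper is worth keeping in mind while reading the recovery flow: seq 0 ${ATTEMPTS} yields ATTEMPTS+1 tries, ten seconds apart, and exhausting them calls "exit 1", which terminates the whole test script rather than returning a non-zero status to the caller. A minimal illustration:

    retry 10 oc get machines -n openshift-machine-api   # up to 11 attempts; the script exits if all fail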