diff --git a/test/e2e/dr/common.go b/test/e2e/dr/common.go new file mode 100644 index 000000000000..b5dce62d4134 --- /dev/null +++ b/test/e2e/dr/common.go @@ -0,0 +1,473 @@ +package dr + +import ( + "bytes" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" + "text/tabwriter" + "time" + + corev1 "k8s.io/api/core/v1" + kerrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/dynamic" + e2e "k8s.io/kubernetes/test/e2e/framework" + + "github.com/openshift/origin/test/e2e/upgrade" + excl "github.com/openshift/origin/test/extended/cluster" + exutil "github.com/openshift/origin/test/extended/util" + + o "github.com/onsi/gomega" + "github.com/stretchr/objx" +) + +const ( + sshOpts = "-o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=30" + proxyTemplate = "ssh -A %s -W %%h:%%p core@%s 2>/dev/null" + scpToHostTemplate = "scp %s -o ProxyCommand=\"%s\" %s core@%s:%s" + scpFromHostTemplate = "scp %s -o ProxyCommand=\"%s\" core@%s:%s %s" + sshTemplate = "ssh %s -o ProxyCommand=\"%s\" core@%s \"%s\"" + bastionNamespace = "ssh-bastion" + + operatorWait = 15 * time.Minute +) + +func createPasswdEntry(homeDir string) { + e2e.Logf("Adding a fake user entry") + userName := os.Getenv("USER_NAME") + if len(userName) == 0 { + userName = "default" + } + // User IDs are fake in openshift, so os/user would return nil + uid := strings.TrimSuffix(runCommandAndRetry("id -u"), "\n") + passwdEntry := fmt.Sprintf("%s:x:%s:0:%s user:%s:/sbin/nologin\n", userName, uid, userName, homeDir) + + f, err := os.OpenFile("/etc/passwd", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + o.Expect(err).NotTo(o.HaveOccurred()) + defer f.Close() + _, err = f.WriteString(passwdEntry) + o.Expect(err).NotTo(o.HaveOccurred()) +} + +func copyKubeSSHKeyToUser(homeDir string) { + e2e.Logf("Copying kube's ssh key to %s", homeDir) + var ( + sshDirPath = filepath.Join(homeDir, ".ssh") + destPath = filepath.Join(sshDirPath, "id_rsa") + ) + + os.MkdirAll(sshDirPath, os.ModePerm) + + kubeSSHPath := os.Getenv("KUBE_SSH_KEY_PATH") + o.Expect(kubeSSHPath).NotTo(o.HaveLen(0)) + + source, err := os.Open(kubeSSHPath) + o.Expect(err).NotTo(o.HaveOccurred()) + defer source.Close() + + destination, err := os.Create(destPath) + o.Expect(err).NotTo(o.HaveOccurred()) + defer destination.Close() + + _, err = io.Copy(destination, source) + o.Expect(err).NotTo(o.HaveOccurred()) + + err = os.Chmod(destPath, 0600) + o.Expect(err).NotTo(o.HaveOccurred()) +} + +func createSSHKeys() ([]string, string) { + var ( + keyTypes = []string{"rsa", "ecdsa", "ed25519"} + tmpFiles = make([]string, len(keyTypes)) + ) + + homeDir := os.Getenv("HOME") + + // Ensure user entry is created in the container + createPasswdEntry(homeDir) + + // Copy test ssh key to user dir + copyKubeSSHKeyToUser(homeDir) + + // Create temporary ssh keys + tmpDir, err := ioutil.TempDir("/tmp", "ssh-keys") + o.Expect(err).NotTo(o.HaveOccurred()) + + for index, keyType := range keyTypes { + keyPath := filepath.Join(tmpDir, keyType) + e2e.Logf("Generating %s key in %s", keyType, keyPath) + out, err := exec.Command( + "/usr/bin/ssh-keygen", + "-q", // silence + "-t", keyType, // type + "-f", keyPath, // output file + "-C", "", // no comment + "-N", "", // no passphrase + ).CombinedOutput() + if err != nil { + e2e.Logf("ssh-keygen output:\n%s", out) + } + o.Expect(err).NotTo(o.HaveOccurred()) 
+ tmpFiles[index] = keyPath + } + return tmpFiles, tmpDir +} + +func setupSSHBastion(oc *exutil.CLI) string { + e2e.Logf("Setting up ssh bastion host") + + var ( + bastionHost = "" + sshBastionBaseDir = exutil.FixturePath("testdata", "disaster-recovery", "ssh-bastion") + files = []string{ + "service.yaml", + "serviceaccount.yaml", + "role.yaml", + "rolebinding.yaml", + "clusterrole.yaml", + "clusterrolebinding.yaml", + "deployment.yaml", + } + ) + + ok, err := excl.ProjectExists(oc, bastionNamespace) + if err == nil && !ok { + err := oc.Run("new-project").Args(bastionNamespace).Execute() + o.Expect(err).NotTo(o.HaveOccurred()) + } + o.Expect(err).NotTo(o.HaveOccurred()) + + e2e.Logf("Ensuring ssh host key secret exists") + _, err = oc.AdminKubeClient().CoreV1().Secrets(bastionNamespace).Get("ssh-host-keys", metav1.GetOptions{}) + if kerrors.IsNotFound(err) { + tmpFiles, tmpDir := createSSHKeys() + defer os.RemoveAll(tmpDir) + + secretKeyArgs := fmt.Sprintf( + "ssh_host_rsa_key=%s,ssh_host_ecdsa_key=%s,ssh_host_ed25519_key=%s,sshd_config=%s", + tmpFiles[0], tmpFiles[1], tmpFiles[2], filepath.Join(sshBastionBaseDir, "sshd_config"), + ) + _, err = oc.Run("create").Args("-n", bastionNamespace, "secret", "generic", "ssh-host-keys", "--from-file", secretKeyArgs).Output() + o.Expect(err).NotTo(o.HaveOccurred()) + } else { + o.Expect(err).NotTo(o.HaveOccurred()) + } + + e2e.Logf("Deploying ssh bastion") + for _, file := range files { + testDataPath := filepath.Join(sshBastionBaseDir, file) + err := oc.Run("apply").Args("-f", testDataPath).Execute() + o.Expect(err).NotTo(o.HaveOccurred()) + } + + e2e.Logf("Waiting for load balancer to be created") + err = wait.Poll(10*time.Second, 5*time.Minute, func() (done bool, err error) { + svc, err := oc.AdminKubeClient().CoreV1().Services(bastionNamespace).Get("ssh-bastion", metav1.GetOptions{}) + if err != nil { + return false, nil + } + if svc.Spec.Type != corev1.ServiceTypeLoadBalancer { + return true, fmt.Errorf("incorrect service type: %v", svc.Spec.Type) + } + if len(svc.Status.LoadBalancer.Ingress) == 0 { + return false, nil + } + bastionHost = svc.Status.LoadBalancer.Ingress[0].Hostname + return true, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) + e2e.Logf("Bastion host: %s", bastionHost) + + e2e.Logf("Waiting for host to be resolvable") + err = wait.Poll(10*time.Second, 5*time.Minute, func() (done bool, err error) { + _, err = exec.Command("nslookup", bastionHost).Output() + if err != nil { + return false, nil + } + return true, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) + + return bastionHost +} + +func runCommandAndRetry(command string) string { + const ( + maxRetries = 10 + pause = 10 + ) + var ( + retryCount = 0 + out []byte + err error + ) + e2e.Logf("command '%s'", command) + for retryCount = 0; retryCount <= maxRetries; retryCount++ { + out, err = exec.Command("bash", "-c", command).CombinedOutput() + e2e.Logf("output:\n%s", out) + if err == nil { + break + } + e2e.Logf("%v", err) + time.Sleep(time.Second * pause) + } + o.Expect(retryCount).NotTo(o.Equal(maxRetries + 1)) + return string(out) +} + +func scpFileToHost(src string, proxy string, dest string, destHost string) { + e2e.Logf("Copying %s to %s at host '%s' via %s", src, dest, destHost, proxy) + + command := fmt.Sprintf(scpToHostTemplate, sshOpts, proxy, src, destHost, dest) + runCommandAndRetry(command) +}
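For context, `scpFileToHost` above and `scpFileFromHost`/`runViaBastionSSH` below all expand the template constants from the top of this file into a single nested ssh invocation that tunnels through the bastion. A minimal standalone sketch of that expansion, with illustrative host names (not from a real cluster):

```go
package main

import "fmt"

// Constants copied from common.go. The %%h/%%p escapes survive the first
// Sprintf so that ssh itself substitutes the target host and port into the
// ProxyCommand.
const (
	sshOpts       = "-o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=30"
	proxyTemplate = "ssh -A %s -W %%h:%%p core@%s 2>/dev/null"
	sshTemplate   = "ssh %s -o ProxyCommand=\"%s\" core@%s \"%s\""
)

func main() {
	// Illustrative host names only.
	proxy := fmt.Sprintf(proxyTemplate, sshOpts, "bastion.example.com")
	cmd := fmt.Sprintf(sshTemplate, sshOpts, proxy, "master-0.example.com", "hostname -f")
	fmt.Println(cmd)
	// ssh <opts> -o ProxyCommand="ssh -A <opts> -W %h:%p core@bastion.example.com 2>/dev/null" core@master-0.example.com "hostname -f"
}
```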
+func scpFileFromHost(src string, srcHost string, proxy string, dest string) { + e2e.Logf("Copying %s from '%s' to %s via %s", src, srcHost, dest, proxy) + + command := fmt.Sprintf(scpFromHostTemplate, sshOpts, proxy, srcHost, src, dest) + runCommandAndRetry(command) +} + +func runViaBastionSSH(host string, proxy string, remoteCommand string) string { + e2e.Logf("Running '%s' on host %s via %s", remoteCommand, host, proxy) + + command := fmt.Sprintf(sshTemplate, sshOpts, proxy, host, remoteCommand) + return runCommandAndRetry(command) +} + +func getAllMasters(oc *exutil.CLI) []string { + nodeNames := sets.NewString() + + e2e.Logf("Fetching a list of masters") + + masterNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(metav1.ListOptions{ + LabelSelector: "node-role.kubernetes.io/master", + }) + o.Expect(err).NotTo(o.HaveOccurred()) + + for i := range masterNodes.Items { + node := &masterNodes.Items[i] + nodeNames.Insert(node.ObjectMeta.Name) + } + + return nodeNames.List() +} + +func constructEtcdConnectionString(masters []string, proxy string) string { + //TODO vrutkovs: replace this nonsense with `etcdctl member list -w json ...` + etcdConnectionString := "" + e2e.Logf("Constructing etcd connection string") + for _, master := range masters { + etcdEnv := runViaBastionSSH(master, proxy, "cat /run/etcd/environment") + var entry string + for _, entry = range strings.Split(etcdEnv, "\n") { + if strings.HasPrefix(entry, "ETCD_DNS_NAME=") { + break + } + } + etcdDNSName := strings.Split(entry, "=")[1] + o.Expect(etcdDNSName).NotTo(o.BeEmpty()) + etcdConnectionString = fmt.Sprintf("%setcd-member-%s=https://%s:2380,", etcdConnectionString, master, etcdDNSName) + } + return etcdConnectionString[:len(etcdConnectionString)-1] +} + +func removeSSHBastion(oc *exutil.CLI) { + e2e.Logf("Removing ssh bastion") + err := oc.AdminKubeClient().CoreV1().Namespaces().Delete(bastionNamespace, &metav1.DeleteOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) +} + +func waitForMastersToUpdate(oc *exutil.CLI, mcps dynamic.NamespaceableResourceInterface) { + e2e.Logf("Waiting for MachineConfig master to finish rolling out") + err := wait.Poll(30*time.Second, 30*time.Minute, func() (done bool, err error) { + return upgrade.IsPoolUpdated(mcps, "master") + }) + o.Expect(err).NotTo(o.HaveOccurred()) +}
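`waitForOperatorsToSettle` below walks ClusterOperator objects through objx rather than a typed client. As a self-contained sketch of that condition lookup, run against a hand-built ClusterOperator-shaped map (the field values are illustrative):

```go
package main

import (
	"fmt"

	"github.com/stretchr/objx"
)

// findCondition mirrors the condition() helper in common.go: scan
// status.conditions for the entry whose "type" field matches.
func findCondition(co objx.Map, condType string) objx.Map {
	for _, c := range co.Get("status.conditions").InterSlice() {
		if m, ok := c.(map[string]interface{}); ok {
			cond := objx.Map(m)
			if cond.Get("type").String() == condType {
				return cond
			}
		}
	}
	return nil
}

func main() {
	// A hand-built ClusterOperator-shaped object, for illustration only.
	co := objx.Map(map[string]interface{}{
		"status": map[string]interface{}{
			"conditions": []interface{}{
				map[string]interface{}{"type": "Progressing", "status": "False"},
				map[string]interface{}{"type": "Available", "status": "True"},
			},
		},
	})
	fmt.Println(findCondition(co, "Available").Get("status").String()) // True
}
```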
co.Get("metadata.namespace").String() + name := co.Get("metadata.name").String() + unavailableNames = append(unavailableNames, fmt.Sprintf("%s/%s", ns, name)) + unavailable = append(unavailable, co) + break + } + } + if len(unavailable) > 0 { + e2e.Logf("Operators still doing work: %s", strings.Join(unavailableNames, ", ")) + return false, nil + } + return true, nil + }) + + o.Expect(lastErr).NotTo(o.HaveOccurred()) + var unavailable []string + buf := &bytes.Buffer{} + w := tabwriter.NewWriter(buf, 0, 4, 1, ' ', 0) + fmt.Fprintf(w, "NAMESPACE\tNAME\tPROGRESSING\tAVAILABLE\tVERSION\tMESSAGE\n") + for _, co := range lastCOs { + ns := co.Get("metadata.namespace").String() + name := co.Get("metadata.name").String() + if condition(co, "Available").Get("status").String() != "True" { + unavailable = append(unavailable, fmt.Sprintf("%s/%s", ns, name)) + } else { + available[fmt.Sprintf("%s/%s", ns, name)] = struct{}{} + } + fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\n", + ns, + name, + condition(co, "Progressing").Get("status").String(), + condition(co, "Available").Get("status").String(), + co.Get("status.version").String(), + condition(co, "Failing").Get("message").String(), + ) + } + w.Flush() + e2e.Logf("ClusterOperators:\n%s", buf.String()) + if len(unavailable) > 0 { + e2e.Failf("Some cluster operators never became available %s", strings.Join(unavailable, ", ")) + } + // Check at least one core operator is available + if len(available) == 0 { + e2e.Failf("There must be at least one cluster operator") + } +} + +func restartSDNPods(oc *exutil.CLI) { + e2e.Logf("Restarting SDN") + + pods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-sdn").List(metav1.ListOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + + for _, pod := range pods.Items { + e2e.Logf("Deleting pod %s", pod.Name) + err := oc.AdminKubeClient().CoreV1().Pods("openshift-sdn").Delete(pod.Name, &metav1.DeleteOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + } + + err = wait.Poll(10*time.Second, 5*time.Minute, func() (done bool, err error) { + sdnDaemonset, err := oc.AdminKubeClient().ExtensionsV1beta1().DaemonSets("openshift-sdn").Get("sdn", metav1.GetOptions{}) + if err != nil { + return false, nil + } + return sdnDaemonset.Status.NumberReady == sdnDaemonset.Status.NumberAvailable, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) +} + +func restartOpenshiftAPIPods(oc *exutil.CLI) { + e2e.Logf("Restarting Openshift API server") + + pods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-apiserver").List(metav1.ListOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + + for _, pod := range pods.Items { + e2e.Logf("Deleting pod %s", pod.Name) + err := oc.AdminKubeClient().CoreV1().Pods("openshift-apiserver").Delete(pod.Name, &metav1.DeleteOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + } + + err = wait.Poll(10*time.Second, 5*time.Minute, func() (done bool, err error) { + apiServerDS, err := oc.AdminKubeClient().ExtensionsV1beta1().DaemonSets("openshift-apiserver").Get("apiserver", metav1.GetOptions{}) + if err != nil { + return false, nil + } + return apiServerDS.Status.NumberReady == apiServerDS.Status.NumberAvailable, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) +} + +func restartMCDPods(oc *exutil.CLI) { + e2e.Logf("Restarting MCD pods") + + pods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-machine-config-operator").List(metav1.ListOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + + for _, pod := range pods.Items { + e2e.Logf("Deleting pod %s", pod.Name) + err := 
oc.AdminKubeClient().CoreV1().Pods("openshift-machine-config-operator").Delete(pod.Name, &metav1.DeleteOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + } + + err = wait.Poll(10*time.Second, 5*time.Minute, func() (done bool, err error) { + mcDS, err := oc.AdminKubeClient().ExtensionsV1beta1().DaemonSets("openshift-machine-config-operator").Get("machine-config-daemon", metav1.GetOptions{}) + if err != nil { + return false, nil + } + return mcDS.Status.NumberReady == mcDS.Status.NumberAvailable, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) +} + +func objects(from *objx.Value) []objx.Map { + var values []objx.Map + switch { + case from.IsObjxMapSlice(): + return from.ObjxMapSlice() + case from.IsInterSlice(): + for _, i := range from.InterSlice() { + if msi, ok := i.(map[string]interface{}); ok { + values = append(values, objx.Map(msi)) + } + } + } + return values +} + +func condition(cv objx.Map, condition string) objx.Map { + for _, obj := range objects(cv.Get("status.conditions")) { + if obj.Get("type").String() == condition { + return obj + } + } + return objx.Map(nil) +} diff --git a/test/e2e/dr/quorum_restore.go b/test/e2e/dr/quorum_restore.go new file mode 100644 index 000000000000..32bd2f007687 --- /dev/null +++ b/test/e2e/dr/quorum_restore.go @@ -0,0 +1,332 @@ +package dr + +import ( + "fmt" + "io/ioutil" + "net" + "net/url" + "os" + "strings" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/dynamic" + e2e "k8s.io/kubernetes/test/e2e/framework" + + exutil "github.com/openshift/origin/test/extended/util" + + g "github.com/onsi/ginkgo" + o "github.com/onsi/gomega" +) + +const ( + machineAnnotationName = "machine.openshift.io/machine" + localEtcdSignerYaml = "/tmp/kube-etcd-cert-signer.yaml" + expectedNumberOfMasters = 3 +) + +var _ = g.Describe("[Feature:DisasterRecovery][Disruptive]", func() { + f := e2e.NewDefaultFramework("disaster-recovery") + f.SkipNamespaceCreation = true + f.SkipPrivilegedPSPBinding = true + + oc := exutil.NewCLIWithoutNamespace("disaster-recovery") + + g.It("[dr-quorum-restore] Cluster should restore itself after quorum loss", func() { + config, err := e2e.LoadConfig() + o.Expect(err).NotTo(o.HaveOccurred()) + dynamicClient := dynamic.NewForConfigOrDie(config) + ms := dynamicClient.Resource(schema.GroupVersionResource{ + Group: "machine.openshift.io", + Version: "v1beta1", + Resource: "machines", + }).Namespace("openshift-machine-api") + mcps := dynamicClient.Resource(schema.GroupVersionResource{ + Group: "machineconfiguration.openshift.io", + Version: "v1", + Resource: "machineconfigpools", + }) + coc := dynamicClient.Resource(schema.GroupVersionResource{ + Group: "config.openshift.io", + Version: "v1", + Resource: "clusteroperators", + }) + + bastionHost := setupSSHBastion(oc) + proxy := fmt.Sprintf(proxyTemplate, sshOpts, bastionHost) + defer removeSSHBastion(oc) + + scaleEtcdQuorum(oc, 0) + + e2e.Logf("Finding two masters to remove") + mapiPods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-machine-api").List(metav1.ListOptions{ + LabelSelector: "k8s-app=controller", + }) + o.Expect(err).NotTo(o.HaveOccurred()) + o.Expect(mapiPods.Items).NotTo(o.BeEmpty()) + + survivingNodeName := mapiPods.Items[0].Spec.NodeName + mastersNodes := getAllMasters(oc) + o.Expect(mastersNodes).NotTo(o.BeEmpty()) + + survivingMachineName := getMachineNameByNodeName(oc, survivingNodeName) + survivingMachine, err := ms.Get(survivingMachineName, 
metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + + // Set etcd connection string before destroying masters, as ssh bastion may be unavailable + etcdConnectionString := constructEtcdConnectionString([]string{survivingNodeName}, proxy) + + e2e.Logf("Destroy 2 masters") + masterMachines := make([]string, len(mastersNodes)) + for i, node := range mastersNodes { + masterMachine := getMachineNameByNodeName(oc, node) + masterMachines[i] = masterMachine + + if node == survivingNodeName { + continue + } + + e2e.Logf("Destroying %s", masterMachine) + err = ms.Delete(masterMachine, &metav1.DeleteOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + } + e2e.Logf("masterMachines: %v", masterMachines) + + e2e.Logf("Confirm meltdown") + time.Sleep(30 * time.Second) + err = wait.Poll(30*time.Second, 30*time.Minute, func() (done bool, err error) { + _, err = oc.AdminKubeClient().CoreV1().Nodes().List(metav1.ListOptions{}) + return err != nil, nil + }) + + e2e.Logf("Restore single node etcd") + runViaBastionSSH(survivingNodeName, proxy, + fmt.Sprintf("sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-restore.sh /root/assets/backup/etcd/member/snap/db %s", etcdConnectionString)) + + e2e.Logf("Wait for API server to come up") + time.Sleep(30 * time.Second) + err = wait.Poll(30*time.Second, 30*time.Minute, func() (done bool, err error) { + nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil || nodes.Items == nil { + e2e.Logf("return false - err %v nodes.Items %v", err, nodes.Items) + return false, nil + } + return true, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) + + e2e.Logf("Create new masters") + for _, master := range masterMachines { + if master == survivingMachineName { + continue + } + e2e.Logf("Creating master %s", master) + newMaster := survivingMachine.DeepCopy() + newMaster.SetName(master) + newMaster.SetResourceVersion("") + newMaster.SetSelfLink("") + newMaster.SetUID("") + newMaster.SetCreationTimestamp(metav1.NewTime(time.Time{})) + _, err := ms.Create(newMaster, metav1.CreateOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + } + + e2e.Logf("Waiting for machines to be created") + err = wait.Poll(30*time.Second, 10*time.Minute, func() (done bool, err error) { + mastersList, err := ms.List(metav1.ListOptions{ + LabelSelector: "machine.openshift.io/cluster-api-machine-role=master", + }) + if err != nil || mastersList.Items == nil { + e2e.Logf("return false - err %v mastersList.Items %v", err, mastersList.Items) + return false, err + } + return len(mastersList.Items) == expectedNumberOfMasters, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) + + e2e.Logf("Wait for masters to join as nodes") + err = wait.Poll(30*time.Second, 30*time.Minute, func() (done bool, err error) { + defer func() { + if r := recover(); r != nil { + fmt.Println("Recovered from panic", r) + } + }() + nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(metav1.ListOptions{ + LabelSelector: "node-role.kubernetes.io/master=", + }) + if err != nil || nodes.Items == nil { + e2e.Logf("return false - err %v nodes.Items %v", err, nodes.Items) + return false, err + } + return len(nodes.Items) == expectedNumberOfMasters, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) + + e2e.Logf("Update DNS records") + var survivingMasterIP string + runCommandAndRetry("easy_install --user pip && ~/.local/bin/pip install --user boto3") + + infra, err := oc.AdminConfigClient().ConfigV1().Infrastructures().Get("cluster", metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + + 
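The DNS update below first derives the cluster's base domain from the infrastructure object's internal API URL. A standalone sketch of that parsing, using a made-up URL:

```go
package main

import (
	"fmt"
	"net"
	"net/url"
	"strings"
)

func main() {
	// Illustrative value; Infrastructure.Status.APIServerURL normally looks
	// like https://api.<cluster-name>.<base-domain>:6443.
	apiServerURL := "https://api.mycluster.example.com:6443"

	u, err := url.Parse(apiServerURL)
	if err != nil {
		panic(err)
	}
	host := strings.Replace(u.Host, "api.", "", 1) // mycluster.example.com:6443

	domain, _, err := net.SplitHostPort(host)
	if err != nil {
		panic(err)
	}
	fmt.Println(domain) // mycluster.example.com
}
```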
internalAPI, err := url.Parse(infra.Status.APIServerURL) + o.Expect(err).NotTo(o.HaveOccurred()) + internalAPI.Host = strings.Replace(internalAPI.Host, "api.", "", 1) + + domain, _, err := net.SplitHostPort(internalAPI.Host) + o.Expect(err).NotTo(o.HaveOccurred()) + e2e.Logf("domain: %s", domain) + masterNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(metav1.ListOptions{ + LabelSelector: "node-role.kubernetes.io/master", + }) + o.Expect(err).NotTo(o.HaveOccurred()) + for i := range masterNodes.Items { + node := &masterNodes.Items[i] + etcdName := fmt.Sprintf("etcd-%d.%s", i, domain) + masterIP := "" + for _, address := range node.Status.Addresses { + if address.Type == "InternalIP" { + masterIP = address.Address + break + } + } + if node.GetName() == survivingNodeName { + survivingMasterIP = masterIP + } + updateDNS(domain, etcdName, masterIP) + } + + imagePullSecretPath := getPullSecret(oc) + defer os.Remove(imagePullSecretPath) + runPodSigner(oc, survivingNodeName, imagePullSecretPath, proxy) + + e2e.Logf("Restore etcd on remaining masters") + setupEtcdEnvImage := getImagePullSpecFromRelease(oc, imagePullSecretPath, "setup-etcd-environment") + kubeClientAgent := getImagePullSpecFromRelease(oc, imagePullSecretPath, "kube-client-agent") + for i := range masterNodes.Items { + node := &masterNodes.Items[i] + masterDNS := "" + for _, address := range node.Status.Addresses { + if address.Type == "InternalDNS" { + masterDNS = address.Address + break + } + } + if masterDNS == survivingNodeName { + e2e.Logf("Skipping node as it's the surviving master") + continue + } + runViaBastionSSH(masterDNS, proxy, + fmt.Sprintf("sudo -i env SETUP_ETCD_ENVIRONMENT=%s KUBE_CLIENT_AGENT=%s /bin/bash -x /usr/local/bin/etcd-member-recover.sh %s \"etcd-member-%s\"", + setupEtcdEnvImage, kubeClientAgent, survivingMasterIP, node.GetName())) + } + + e2e.Logf("Wait for etcd pods to become available") + _, err = exutil.WaitForPods( + oc.AdminKubeClient().CoreV1().Pods("openshift-etcd"), + exutil.ParseLabelsOrDie("k8s-app=etcd"), + exutil.CheckPodIsReady, + expectedNumberOfMasters, + 10*time.Minute, + ) + o.Expect(err).NotTo(o.HaveOccurred()) + + scaleEtcdQuorum(oc, expectedNumberOfMasters) + + e2e.Logf("Remove etcd signer") + err = oc.AdminKubeClient().CoreV1().Pods("openshift-config").Delete("etcd-signer", &metav1.DeleteOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + + // Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1707006# + // SDN won't switch to Degraded mode when service is down after disaster recovery + restartSDNPods(oc) + waitForMastersToUpdate(oc, mcps) + waitForOperatorsToSettle(coc) + }) +}) + +func scaleEtcdQuorum(oc *exutil.CLI, replicas int) { + e2e.Logf("Scale etcd-quorum-guard to %d replicas", replicas) + etcdQGScale, err := oc.AdminKubeClient().AppsV1().Deployments("openshift-machine-config-operator").GetScale("etcd-quorum-guard", metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + etcdQGScale.Spec.Replicas = int32(replicas) + _, err = oc.AdminKubeClient().AppsV1().Deployments("openshift-machine-config-operator").UpdateScale("etcd-quorum-guard", etcdQGScale) + o.Expect(err).NotTo(o.HaveOccurred()) + + etcdQGScale, err = oc.AdminKubeClient().AppsV1().Deployments("openshift-machine-config-operator").GetScale("etcd-quorum-guard", metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + o.Expect(etcdQGScale.Spec.Replicas).To(o.Equal(int32(replicas))) +} + +func runPodSigner(oc *exutil.CLI, survivingNodeName, imagePullSecretPath, proxy string) { + e2e.Logf("Run etcd signer pod") + nodeHostname := 
strings.Split(survivingNodeName, ".")[0] + + kubeEtcdSignerServerImage := getImagePullSpecFromRelease(oc, imagePullSecretPath, "kube-etcd-signer-server") + runViaBastionSSH(survivingNodeName, proxy, + fmt.Sprintf("sudo -i env KUBE_ETCD_SIGNER_SERVER=%s /bin/bash -x /usr/local/bin/tokenize-signer.sh %s && sudo -i install -o core -g core /root/assets/manifests/kube-etcd-cert-signer.yaml /tmp/kube-etcd-cert-signer.yaml", + kubeEtcdSignerServerImage, nodeHostname)) + scpFileFromHost("/tmp/kube-etcd-cert-signer.yaml", survivingNodeName, proxy, localEtcdSignerYaml) + err := oc.Run("apply").Args("-f", localEtcdSignerYaml).Execute() + o.Expect(err).NotTo(o.HaveOccurred()) + + e2e.Logf("Wait for etcd signer pod to become Ready") + _, err = exutil.WaitForPods( + oc.AdminKubeClient().CoreV1().Pods("openshift-config"), + exutil.ParseLabelsOrDie("k8s-app=etcd"), + exutil.CheckPodIsReady, + 1, + 10*time.Minute, + ) + o.Expect(err).NotTo(o.HaveOccurred()) +} + +func getPullSecret(oc *exutil.CLI) string { + e2e.Logf("Saving image pull secret") + //TODO: copy of test/extended/operators/images.go, move this to a common func + imagePullSecret, err := oc.KubeFramework().ClientSet.CoreV1().Secrets("openshift-config").Get("pull-secret", metav1.GetOptions{}) + if err != nil { + e2e.Failf("unable to get pull secret for cluster: %v", err) + } + + // cache file to local temp location + imagePullFile, err := ioutil.TempFile("", "image-pull-secret") + if err != nil { + e2e.Failf("unable to create a temporary file: %v", err) + } + + // write the content + imagePullSecretBytes := imagePullSecret.Data[".dockerconfigjson"] + if _, err := imagePullFile.Write(imagePullSecretBytes); err != nil { + e2e.Failf("unable to write pull secret to temp file: %v", err) + } + if err := imagePullFile.Close(); err != nil { + e2e.Failf("unable to close file: %v", err) + } + e2e.Logf("Image pull secret: %s", imagePullFile.Name()) + return imagePullFile.Name() +} + +func getImagePullSpecFromRelease(oc *exutil.CLI, imagePullSecretPath, imageName string) string { + image, err := oc.Run("adm", "release", "info").Args("--image-for", imageName, "--registry-config", imagePullSecretPath).Output() + o.Expect(err).NotTo(o.HaveOccurred()) + return image +} + +func updateDNS(domain string, etcdName, masterIP string) { + //TODO vrutkovs: make a golang native version + scriptPath := exutil.FixturePath("testdata", "disaster-recovery", "update_route_53.py") + runCommandAndRetry(fmt.Sprintf( + "python %s %s %s %s", scriptPath, domain, etcdName, masterIP)) +} + +func getMachineNameByNodeName(oc *exutil.CLI, name string) string { + masterNode, err := oc.AdminKubeClient().CoreV1().Nodes().Get(name, metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + + annotations := masterNode.GetAnnotations() + o.Expect(annotations).To(o.HaveKey(machineAnnotationName)) + return strings.Split(annotations[machineAnnotationName], "/")[1] +} diff --git a/test/e2e/dr/restore_from_snapshot.go b/test/e2e/dr/restore_from_snapshot.go new file mode 100644 index 000000000000..961a31c1a5be --- /dev/null +++ b/test/e2e/dr/restore_from_snapshot.go @@ -0,0 +1,139 @@ +package dr + +import ( + "fmt" + "os" + "strings" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/dynamic" + e2e "k8s.io/kubernetes/test/e2e/framework" + + exutil "github.com/openshift/origin/test/extended/util" 
+ + g "github.com/onsi/ginkgo" + o "github.com/onsi/gomega" +) + +const ( + rollBackMachineConfig = "99-rollback-test" +) + +var _ = g.Describe("[Feature:DisasterRecovery][Disruptive]", func() { + f := e2e.NewDefaultFramework("disaster-recovery") + f.SkipNamespaceCreation = true + f.SkipPrivilegedPSPBinding = true + + oc := exutil.NewCLIWithoutNamespace("disaster-recovery") + + g.It("[dr-etcd-snapshot] Cluster should restore itself from etcd snapshot", func() { + config, err := e2e.LoadConfig() + o.Expect(err).NotTo(o.HaveOccurred()) + dynamicClient := dynamic.NewForConfigOrDie(config) + mcps := dynamicClient.Resource(schema.GroupVersionResource{ + Group: "machineconfiguration.openshift.io", + Version: "v1", + Resource: "machineconfigpools", + }) + mc := dynamicClient.Resource(schema.GroupVersionResource{ + Group: "machineconfiguration.openshift.io", + Version: "v1", + Resource: "machineconfigs", + }) + coc := dynamicClient.Resource(schema.GroupVersionResource{ + Group: "config.openshift.io", + Version: "v1", + Resource: "clusteroperators", + }) + + bastionHost := setupSSHBastion(oc) + proxy := fmt.Sprintf(proxyTemplate, sshOpts, bastionHost) + defer removeSSHBastion(oc) + + setMachineConfig("rollback-A.yaml", oc, mcps) + + masters := getAllMasters(oc) + e2e.Logf("masters: %v", masters) + o.Expect(masters).NotTo(o.BeEmpty()) + firstMaster := masters[0] + e2e.Logf("first master: %v", firstMaster) + + e2e.Logf("Make etcd backup on first master") + runViaBastionSSH(firstMaster, proxy, + "sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-backup.sh /root/assets/backup/snapshot.db") + runViaBastionSSH(firstMaster, proxy, + "sudo -i install -o core -g core /root/assets/backup/snapshot.db /tmp/snapshot.db") + + setMachineConfig("rollback-B.yaml", oc, mcps) + + masterHosts := strings.Join(masters, " ") + restoreScriptPath := exutil.FixturePath("testdata", "disaster-recovery", "restore-etcd.sh") + cmd := fmt.Sprintf("env BASTION_HOST='%s' MASTERHOSTS='%s' KUBE_SSH_KEY_PATH='%s' /bin/bash -x %s ", + bastionHost, masterHosts, os.Getenv("KUBE_SSH_KEY_PATH"), restoreScriptPath) + runCommandAndRetry(cmd) + + time.Sleep(30 * time.Second) + waitForAPIServer(oc) + // restartSDNPods(oc) + restartOpenshiftAPIPods(oc) + restartMCDPods(oc) + waitForMastersToUpdate(oc, mcps) + waitForOperatorsToSettle(coc) + + rollBackInMC := getRollbackContentsInMachineConfig(oc, mc, rollBackMachineConfig) + o.Expect(rollBackInMC).To(o.BeEquivalentTo("data:,A")) + + for _, master := range masters { + rollBackFile := fetchRollbackFileContents(master, proxy) + o.Expect(rollBackFile).To(o.BeEquivalentTo("A")) + } + }) +}) + +func setMachineConfig(rollbackFileName string, oc *exutil.CLI, mcps dynamic.NamespaceableResourceInterface) { + e2e.Logf("Update MachineConfig using %s file on masters", rollbackFileName) + machineConfigTemplate := exutil.FixturePath("testdata", "disaster-recovery", rollbackFileName) + err := oc.Run("apply").Args("-f", machineConfigTemplate).Execute() + o.Expect(err).NotTo(o.HaveOccurred()) + + waitForMastersToUpdate(oc, mcps) +} + +func getRollbackContentsInMachineConfig(oc *exutil.CLI, mcs dynamic.NamespaceableResourceInterface, mcName string) string { + e2e.Logf("Reading contents of rollback MachineConfig") + pool, err := mcs.Get(mcName, metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + + files, found, err := unstructured.NestedSlice(pool.Object, "spec", "config", "storage", "files") + o.Expect(err).NotTo(o.HaveOccurred()) + o.Expect(found).To(o.BeTrue()) + o.Expect(files).NotTo(o.BeEmpty()) + 
+ file := files[0].(map[string]interface{}) + actual, found, err := unstructured.NestedString(file, "contents", "source") + o.Expect(err).NotTo(o.HaveOccurred()) + o.Expect(found).To(o.BeTrue()) + + return actual +} + +func waitForAPIServer(oc *exutil.CLI) { + e2e.Logf("Waiting for API server to restore") + err := wait.Poll(10*time.Second, 5*time.Minute, func() (done bool, err error) { + _, err = oc.AdminKubeClient().CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + return false, nil + } + return true, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) +} + +func fetchRollbackFileContents(master string, proxy string) string { + e2e.Logf("Fetching /etc/rollback-test file contents from %s", master) + return runViaBastionSSH(master, proxy, "cat /etc/rollback-test") +} diff --git a/test/e2e/upgrade/upgrade.go b/test/e2e/upgrade/upgrade.go index 4b0800079c6a..73cd16142bd1 100644 --- a/test/e2e/upgrade/upgrade.go +++ b/test/e2e/upgrade/upgrade.go @@ -496,7 +496,7 @@ func clusterUpgrade(c configv1client.Interface, dc dynamic.Interface, config *re } allUpdated := true for _, p := range pools.Items { - updated, err := isPoolUpdated(mcps, p.GetName()) + updated, err := IsPoolUpdated(mcps, p.GetName()) if err != nil { framework.Logf("error checking pool %s: %v", p.GetName(), err) return false, nil @@ -513,7 +513,7 @@ func clusterUpgrade(c configv1client.Interface, dc dynamic.Interface, config *re } // TODO(runcom): drop this when MCO types are in openshift/api and we can use the typed client directly -func isPoolUpdated(dc dynamic.NamespaceableResourceInterface, name string) (bool, error) { +func IsPoolUpdated(dc dynamic.NamespaceableResourceInterface, name string) (bool, error) { pool, err := dc.Get(name, metav1.GetOptions{}) if err != nil { framework.Logf("error getting pool %s: %v", name, err) diff --git a/test/extended/testdata/bindata.go b/test/extended/testdata/bindata.go index fdb23e83eaa8..40dd65656968 100644 --- a/test/extended/testdata/bindata.go +++ b/test/extended/testdata/bindata.go @@ -322,6 +322,19 @@ // test/extended/testdata/deployments/tag-images-deployment.yaml // test/extended/testdata/deployments/test-deployment-broken.yaml // test/extended/testdata/deployments/test-deployment-test.yaml +// test/extended/testdata/disaster-recovery/restore-etcd.sh +// test/extended/testdata/disaster-recovery/rollback-A.yaml +// test/extended/testdata/disaster-recovery/rollback-B.yaml +// test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml +// test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml +// test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml +// test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml +// test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml +// test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml +// test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml +// test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml +// test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config +// test/extended/testdata/disaster-recovery/update_route_53.py // test/extended/testdata/forcepull-test.json // test/extended/testdata/gssapi/config/kubeconfig // test/extended/testdata/gssapi/config/oauth_config.json @@ -47352,6 +47365,525 @@ func testExtendedTestdataDeploymentsTestDeploymentTestYaml() (*asset, error) { return a, nil } +var _testExtendedTestdataDisasterRecoveryRestoreEtcdSh = []byte(`#!/bin/bash +set -euo pipefail + +if [ -z "${BASTION_HOST}" 
]; then exit 1; fi +if [ -z "${MASTERHOSTS}" ]; then exit 1; fi +if [ -z "${KUBE_SSH_KEY_PATH}" ]; then exit 1; fi + +MASTERS=(${MASTERHOSTS}) +FIRST_MASTER="${MASTERS[0]}" + +function retry() { + local ATTEMPTS="${1}" + local rc=0 + shift + for i in $(seq 0 $((ATTEMPTS-1))); do + echo "--> ${@}" + set +e + "${@}" + rc="$?" + set -e + echo "--> exit code: $rc" + test "${rc}" = 0 && break + sleep 10 + done + return "${rc}" +} + +function bastion_ssh() { + retry 60 \ + ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=30 -o StrictHostKeyChecking=no \ + -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=30 -W %h:%p core@${BASTION_HOST} 2>/dev/null" \ + $@ +} + +echo "Distribute snapshot across all masters" +for master in "${MASTERS[@]}" +do + scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" ${KUBE_SSH_KEY_PATH} "core@${master}":/home/core/.ssh/id_rsa + bastion_ssh "core@${master}" "sudo -i chmod 0600 /home/core/.ssh/id_rsa" + bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/snapshot.db core@${master}:/tmp/snapshot.db" +done + +echo "Collect etcd names" +for master in "${MASTERS[@]}" +do + bastion_ssh "core@${master}" 'echo "etcd-member-$(hostname -f)" > /tmp/etcd_name && source /run/etcd/environment && echo "https://${ETCD_DNS_NAME}:2380" > /tmp/etcd_uri' + bastion_ssh "core@${FIRST_MASTER}" "mkdir -p /tmp/etcd/${master} && scp -o StrictHostKeyChecking=no core@${master}:/tmp/etcd_name /tmp/etcd/${master}/etcd_name && scp -o StrictHostKeyChecking=no core@${master}:/tmp/etcd_uri /tmp/etcd/${master}/etcd_uri" + bastion_ssh "core@${FIRST_MASTER}" "cat /tmp/etcd/${master}/etcd_name" + bastion_ssh "core@${FIRST_MASTER}" "cat /tmp/etcd/${master}/etcd_uri" +done + +echo "Assemble etcd connection string" +bastion_ssh "core@${FIRST_MASTER}" 'rm -rf /tmp/etcd/connstring && mapfile -t MASTERS < <(ls /tmp/etcd) && echo ${MASTERS[@]} && for master in "${MASTERS[@]}"; do echo -n "$(cat /tmp/etcd/${master}/etcd_name)=$(cat /tmp/etcd/${master}/etcd_uri)," >> /tmp/etcd/connstring; done && sed -i '"'$ s/.$//'"' /tmp/etcd/connstring' + +echo "Restore etcd cluster from snapshot" +for master in "${MASTERS[@]}" +do + echo "Running /usr/local/bin/etcd-snapshot-restore.sh on ${master}" + bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/etcd/connstring core@${master}:/tmp/etcd_connstring" + bastion_ssh "core@${master}" 'sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-restore.sh /tmp/snapshot.db $(cat /tmp/etcd_connstring)' +done +`) + +func testExtendedTestdataDisasterRecoveryRestoreEtcdShBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoveryRestoreEtcdSh, nil +} + +func testExtendedTestdataDisasterRecoveryRestoreEtcdSh() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoveryRestoreEtcdShBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/restore-etcd.sh", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoveryRollbackAYaml = []byte(`apiVersion: machineconfiguration.openshift.io/v1 +kind: MachineConfig +metadata: + labels: + machineconfiguration.openshift.io/role: master + name: 99-rollback-test +spec: + config: + ignition: + version: 2.2.0 + storage: + files: + - 
contents: + source: data:,A + filesystem: root + mode: 420 + path: /etc/rollback-test +`) + +func testExtendedTestdataDisasterRecoveryRollbackAYamlBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoveryRollbackAYaml, nil +} + +func testExtendedTestdataDisasterRecoveryRollbackAYaml() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoveryRollbackAYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/rollback-A.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoveryRollbackBYaml = []byte(`apiVersion: machineconfiguration.openshift.io/v1 +kind: MachineConfig +metadata: + labels: + machineconfiguration.openshift.io/role: master + name: 99-rollback-test +spec: + config: + ignition: + version: 2.2.0 + storage: + files: + - contents: + source: data:,B + filesystem: root + mode: 420 + path: /etc/rollback-test +`) + +func testExtendedTestdataDisasterRecoveryRollbackBYamlBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoveryRollbackBYaml, nil +} + +func testExtendedTestdataDisasterRecoveryRollbackBYaml() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoveryRollbackBYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/rollback-B.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoverySshBastionClusterroleYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: ssh-bastion +rules: +- apiGroups: + - "machineconfiguration.openshift.io" + resources: + - "machineconfigs" + verbs: + - get +- apiGroups: + - "" + resources: + - "nodes" + verbs: + - list + - get +`) + +func testExtendedTestdataDisasterRecoverySshBastionClusterroleYamlBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoverySshBastionClusterroleYaml, nil +} + +func testExtendedTestdataDisasterRecoverySshBastionClusterroleYaml() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoverySshBastionClusterroleYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + openshift.io/description: Allows ssh-pod to read nodes and machineconfigs + name: ssh-bastion +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: ssh-bastion +subjects: +- apiGroup: rbac.authorization.k8s.io + kind: User + name: system:serviceaccount:ssh-bastion:ssh-bastion +`) + +func testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYamlBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYaml, nil +} + +func testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYaml() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: 
"test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoverySshBastionDeploymentYaml = []byte(`apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + run: ssh-bastion + name: ssh-bastion + namespace: ssh-bastion +spec: + replicas: 1 + selector: + matchLabels: + run: ssh-bastion + template: + metadata: + labels: + run: ssh-bastion + spec: + serviceAccountName: "ssh-bastion" + containers: + - image: quay.io/eparis/ssh:latest + imagePullPolicy: Always + name: ssh-bastion + ports: + - containerPort: 22 + name: ssh + protocol: TCP + volumeMounts: + - name: ssh-host-keys + mountPath: "/etc/ssh/" + readOnly: true + securityContext: + privileged: true + volumes: + - name: ssh-host-keys + secret: + secretName: ssh-host-keys + items: + - key: ssh_host_rsa_key + path: ssh_host_rsa_key + mode: 256 + - key: ssh_host_ecdsa_key + path: ssh_host_ecdsa_key + mode: 256 + - key: ssh_host_ed25519_key + path: ssh_host_ed25519_key + mode: 256 + - key: sshd_config + path: sshd_config + restartPolicy: Always +`) + +func testExtendedTestdataDisasterRecoverySshBastionDeploymentYamlBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoverySshBastionDeploymentYaml, nil +} + +func testExtendedTestdataDisasterRecoverySshBastionDeploymentYaml() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoverySshBastionDeploymentYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoverySshBastionNamespaceYaml = []byte(`apiVersion: v1 +kind: Namespace +metadata: + name: ssh-bastion + labels: + openshift.io/run-level: "0" + +`) + +func testExtendedTestdataDisasterRecoverySshBastionNamespaceYamlBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoverySshBastionNamespaceYaml, nil +} + +func testExtendedTestdataDisasterRecoverySshBastionNamespaceYaml() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoverySshBastionNamespaceYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoverySshBastionRoleYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: ssh-bastion + namespace: ssh-bastion +rules: +- apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - use + resourceNames: + - privileged +`) + +func testExtendedTestdataDisasterRecoverySshBastionRoleYamlBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoverySshBastionRoleYaml, nil +} + +func testExtendedTestdataDisasterRecoverySshBastionRoleYaml() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoverySshBastionRoleYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var 
_testExtendedTestdataDisasterRecoverySshBastionRolebindingYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + annotations: + openshift.io/description: Allows ssh-pod to run as root + name: ssh-bastion + namespace: ssh-bastion +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: ssh-bastion +subjects: +- apiGroup: rbac.authorization.k8s.io + kind: User + name: system:serviceaccount:ssh-bastion:ssh-bastion +`) + +func testExtendedTestdataDisasterRecoverySshBastionRolebindingYamlBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoverySshBastionRolebindingYaml, nil +} + +func testExtendedTestdataDisasterRecoverySshBastionRolebindingYaml() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoverySshBastionRolebindingYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoverySshBastionServiceYaml = []byte(`apiVersion: v1 +kind: Service +metadata: + labels: + run: ssh-bastion + name: ssh-bastion + namespace: ssh-bastion +spec: + externalTrafficPolicy: Local + ports: + - name: ssh + port: 22 + protocol: TCP + targetPort: ssh + selector: + run: ssh-bastion + type: LoadBalancer +`) + +func testExtendedTestdataDisasterRecoverySshBastionServiceYamlBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoverySshBastionServiceYaml, nil +} + +func testExtendedTestdataDisasterRecoverySshBastionServiceYaml() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoverySshBastionServiceYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoverySshBastionServiceaccountYaml = []byte(`apiVersion: v1 +kind: ServiceAccount +metadata: + name: ssh-bastion + namespace: ssh-bastion +`) + +func testExtendedTestdataDisasterRecoverySshBastionServiceaccountYamlBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoverySshBastionServiceaccountYaml, nil +} + +func testExtendedTestdataDisasterRecoverySshBastionServiceaccountYaml() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoverySshBastionServiceaccountYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoverySshBastionSshd_config = []byte(`HostKey /etc/ssh/ssh_host_rsa_key +HostKey /etc/ssh/ssh_host_ecdsa_key +HostKey /etc/ssh/ssh_host_ed25519_key +SyslogFacility AUTHPRIV +PermitRootLogin no +AuthorizedKeysFile /home/core/.ssh/authorized_keys +PasswordAuthentication no +ChallengeResponseAuthentication no +GSSAPIAuthentication yes +GSSAPICleanupCredentials no +UsePAM yes +X11Forwarding yes +PrintMotd no +AcceptEnv LANG LC_CTYPE LC_NUMERIC LC_TIME LC_COLLATE LC_MONETARY LC_MESSAGES +AcceptEnv LC_PAPER LC_NAME LC_ADDRESS LC_TELEPHONE LC_MEASUREMENT +AcceptEnv LC_IDENTIFICATION LC_ALL LANGUAGE +AcceptEnv XMODIFIERS +Subsystem sftp /usr/libexec/openssh/sftp-server +`) + +func 
testExtendedTestdataDisasterRecoverySshBastionSshd_configBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoverySshBastionSshd_config, nil +} + +func testExtendedTestdataDisasterRecoverySshBastionSshd_config() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoverySshBastionSshd_configBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _testExtendedTestdataDisasterRecoveryUpdate_route_53Py = []byte(`import boto3 +import os +import sys +from time import sleep + +if len(sys.argv) < 4: + print("Usage: ./update_route_53.py <domain> <record> <ip>") + sys.exit(1) + +attempts = 10 +pause = 10 + +domain = sys.argv[1] +record = sys.argv[2] +ip = sys.argv[3] +print("record: %s" % record) +print("ip: %s" % ip) + +client = boto3.client('route53') +r = client.list_hosted_zones_by_name(DNSName=domain, MaxItems="1") +zone_id = r['HostedZones'][0]['Id'].split('/')[-1] + +response = client.change_resource_record_sets( + HostedZoneId=zone_id, + ChangeBatch= { + 'Comment': 'add %s -> %s' % (record, ip), + 'Changes': [ + { + 'Action': 'UPSERT', + 'ResourceRecordSet': { + 'Name': record, + 'Type': 'A', + 'TTL': 60, + 'ResourceRecords': [{'Value': ip}] + } + }] +}) +for i in range(attempts): + print('response: %s' % response) + changeID = response['ChangeInfo']['Id'] + if response['ChangeInfo']['Status'] == "INSYNC": + print('insync found, response: %s' % response) + break + print('waiting for response to complete') + sleep(pause) + response = client.get_change(Id=changeID) +`) + +func testExtendedTestdataDisasterRecoveryUpdate_route_53PyBytes() ([]byte, error) { + return _testExtendedTestdataDisasterRecoveryUpdate_route_53Py, nil +} + +func testExtendedTestdataDisasterRecoveryUpdate_route_53Py() (*asset, error) { + bytes, err := testExtendedTestdataDisasterRecoveryUpdate_route_53PyBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/update_route_53.py", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + var _testExtendedTestdataForcepullTestJson = []byte(`{ "kind": "List", "apiVersion": "v1", @@ -56565,6 +57097,19 @@ var _bindata = map[string]func() (*asset, error){ "test/extended/testdata/deployments/tag-images-deployment.yaml": testExtendedTestdataDeploymentsTagImagesDeploymentYaml, "test/extended/testdata/deployments/test-deployment-broken.yaml": testExtendedTestdataDeploymentsTestDeploymentBrokenYaml, "test/extended/testdata/deployments/test-deployment-test.yaml": testExtendedTestdataDeploymentsTestDeploymentTestYaml, + "test/extended/testdata/disaster-recovery/restore-etcd.sh": testExtendedTestdataDisasterRecoveryRestoreEtcdSh, + "test/extended/testdata/disaster-recovery/rollback-A.yaml": testExtendedTestdataDisasterRecoveryRollbackAYaml, + "test/extended/testdata/disaster-recovery/rollback-B.yaml": testExtendedTestdataDisasterRecoveryRollbackBYaml, + "test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml": testExtendedTestdataDisasterRecoverySshBastionClusterroleYaml, + "test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml": testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYaml, + "test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml": 
testExtendedTestdataDisasterRecoverySshBastionDeploymentYaml, + "test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml": testExtendedTestdataDisasterRecoverySshBastionNamespaceYaml, + "test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml": testExtendedTestdataDisasterRecoverySshBastionRoleYaml, + "test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml": testExtendedTestdataDisasterRecoverySshBastionRolebindingYaml, + "test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml": testExtendedTestdataDisasterRecoverySshBastionServiceYaml, + "test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml": testExtendedTestdataDisasterRecoverySshBastionServiceaccountYaml, + "test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config": testExtendedTestdataDisasterRecoverySshBastionSshd_config, + "test/extended/testdata/disaster-recovery/update_route_53.py": testExtendedTestdataDisasterRecoveryUpdate_route_53Py, "test/extended/testdata/forcepull-test.json": testExtendedTestdataForcepullTestJson, "test/extended/testdata/gssapi/config/kubeconfig": testExtendedTestdataGssapiConfigKubeconfig, "test/extended/testdata/gssapi/config/oauth_config.json": testExtendedTestdataGssapiConfigOauth_configJson, @@ -57194,6 +57739,23 @@ var _bintree = &bintree{nil, map[string]*bintree{ "test-deployment-broken.yaml": &bintree{testExtendedTestdataDeploymentsTestDeploymentBrokenYaml, map[string]*bintree{}}, "test-deployment-test.yaml": &bintree{testExtendedTestdataDeploymentsTestDeploymentTestYaml, map[string]*bintree{}}, }}, + "disaster-recovery": &bintree{nil, map[string]*bintree{ + "restore-etcd.sh": &bintree{testExtendedTestdataDisasterRecoveryRestoreEtcdSh, map[string]*bintree{}}, + "rollback-A.yaml": &bintree{testExtendedTestdataDisasterRecoveryRollbackAYaml, map[string]*bintree{}}, + "rollback-B.yaml": &bintree{testExtendedTestdataDisasterRecoveryRollbackBYaml, map[string]*bintree{}}, + "ssh-bastion": &bintree{nil, map[string]*bintree{ + "clusterrole.yaml": &bintree{testExtendedTestdataDisasterRecoverySshBastionClusterroleYaml, map[string]*bintree{}}, + "clusterrolebinding.yaml": &bintree{testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYaml, map[string]*bintree{}}, + "deployment.yaml": &bintree{testExtendedTestdataDisasterRecoverySshBastionDeploymentYaml, map[string]*bintree{}}, + "namespace.yaml": &bintree{testExtendedTestdataDisasterRecoverySshBastionNamespaceYaml, map[string]*bintree{}}, + "role.yaml": &bintree{testExtendedTestdataDisasterRecoverySshBastionRoleYaml, map[string]*bintree{}}, + "rolebinding.yaml": &bintree{testExtendedTestdataDisasterRecoverySshBastionRolebindingYaml, map[string]*bintree{}}, + "service.yaml": &bintree{testExtendedTestdataDisasterRecoverySshBastionServiceYaml, map[string]*bintree{}}, + "serviceaccount.yaml": &bintree{testExtendedTestdataDisasterRecoverySshBastionServiceaccountYaml, map[string]*bintree{}}, + "sshd_config": &bintree{testExtendedTestdataDisasterRecoverySshBastionSshd_config, map[string]*bintree{}}, + }}, + "update_route_53.py": &bintree{testExtendedTestdataDisasterRecoveryUpdate_route_53Py, map[string]*bintree{}}, + }}, "forcepull-test.json": &bintree{testExtendedTestdataForcepullTestJson, map[string]*bintree{}}, "gssapi": &bintree{nil, map[string]*bintree{ "config": &bintree{nil, map[string]*bintree{ diff --git a/test/extended/testdata/disaster-recovery/restore-etcd.sh b/test/extended/testdata/disaster-recovery/restore-etcd.sh new file mode 100755 index 000000000000..e709848c560f --- 
/dev/null +++ b/test/extended/testdata/disaster-recovery/restore-etcd.sh @@ -0,0 +1,61 @@ +#!/bin/bash +set -euo pipefail + +if [ -z "${BASTION_HOST}" ]; then exit 1; fi +if [ -z "${MASTERHOSTS}" ]; then exit 1; fi +if [ -z "${KUBE_SSH_KEY_PATH}" ]; then exit 1; fi + +MASTERS=(${MASTERHOSTS}) +FIRST_MASTER="${MASTERS[0]}" + +function retry() { + local ATTEMPTS="${1}" + local rc=0 + shift + for i in $(seq 0 $((ATTEMPTS-1))); do + echo "--> ${@}" + set +e + "${@}" + rc="$?" + set -e + echo "--> exit code: $rc" + test "${rc}" = 0 && break + sleep 10 + done + return "${rc}" +} + +function bastion_ssh() { + retry 60 \ + ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=30 -o StrictHostKeyChecking=no \ + -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=30 -W %h:%p core@${BASTION_HOST} 2>/dev/null" \ + $@ +} + +echo "Distribute snapshot across all masters" +for master in "${MASTERS[@]}" +do + scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" ${KUBE_SSH_KEY_PATH} "core@${master}":/home/core/.ssh/id_rsa + bastion_ssh "core@${master}" "sudo -i chmod 0600 /home/core/.ssh/id_rsa" + bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/snapshot.db core@${master}:/tmp/snapshot.db" +done + +echo "Collect etcd names" +for master in "${MASTERS[@]}" +do + bastion_ssh "core@${master}" 'echo "etcd-member-$(hostname -f)" > /tmp/etcd_name && source /run/etcd/environment && echo "https://${ETCD_DNS_NAME}:2380" > /tmp/etcd_uri' + bastion_ssh "core@${FIRST_MASTER}" "mkdir -p /tmp/etcd/${master} && scp -o StrictHostKeyChecking=no core@${master}:/tmp/etcd_name /tmp/etcd/${master}/etcd_name && scp -o StrictHostKeyChecking=no core@${master}:/tmp/etcd_uri /tmp/etcd/${master}/etcd_uri" + bastion_ssh "core@${FIRST_MASTER}" "cat /tmp/etcd/${master}/etcd_name" + bastion_ssh "core@${FIRST_MASTER}" "cat /tmp/etcd/${master}/etcd_uri" +done + +echo "Assemble etcd connection string" +bastion_ssh "core@${FIRST_MASTER}" 'rm -rf /tmp/etcd/connstring && mapfile -t MASTERS < <(ls /tmp/etcd) && echo ${MASTERS[@]} && for master in "${MASTERS[@]}"; do echo -n "$(cat /tmp/etcd/${master}/etcd_name)=$(cat /tmp/etcd/${master}/etcd_uri)," >> /tmp/etcd/connstring; done && sed -i '"'$ s/.$//'"' /tmp/etcd/connstring' + +echo "Restore etcd cluster from snapshot" +for master in "${MASTERS[@]}" +do + echo "Running /usr/local/bin/etcd-snapshot-restore.sh on ${master}" + bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/etcd/connstring core@${master}:/tmp/etcd_connstring" + bastion_ssh "core@${master}" 'sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-restore.sh /tmp/snapshot.db $(cat /tmp/etcd_connstring)' +done diff --git a/test/extended/testdata/disaster-recovery/rollback-A.yaml b/test/extended/testdata/disaster-recovery/rollback-A.yaml new file mode 100644 index 000000000000..9634a704cf9e --- /dev/null +++ b/test/extended/testdata/disaster-recovery/rollback-A.yaml @@ -0,0 +1,17 @@ +apiVersion: machineconfiguration.openshift.io/v1 +kind: MachineConfig +metadata: + labels: + machineconfiguration.openshift.io/role: master + name: 99-rollback-test +spec: + config: + ignition: + version: 2.2.0 + storage: + files: + - contents: + source: data:,A + filesystem: root + mode: 420 + path: /etc/rollback-test diff --git a/test/extended/testdata/disaster-recovery/rollback-B.yaml 
diff --git a/test/extended/testdata/disaster-recovery/rollback-A.yaml b/test/extended/testdata/disaster-recovery/rollback-A.yaml
new file mode 100644
index 000000000000..9634a704cf9e
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/rollback-A.yaml
@@ -0,0 +1,17 @@
+apiVersion: machineconfiguration.openshift.io/v1
+kind: MachineConfig
+metadata:
+  labels:
+    machineconfiguration.openshift.io/role: master
+  name: 99-rollback-test
+spec:
+  config:
+    ignition:
+      version: 2.2.0
+    storage:
+      files:
+      - contents:
+          source: data:,A
+        filesystem: root
+        mode: 420
+        path: /etc/rollback-test
diff --git a/test/extended/testdata/disaster-recovery/rollback-B.yaml b/test/extended/testdata/disaster-recovery/rollback-B.yaml
new file mode 100644
index 000000000000..f731af036ab7
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/rollback-B.yaml
@@ -0,0 +1,17 @@
+apiVersion: machineconfiguration.openshift.io/v1
+kind: MachineConfig
+metadata:
+  labels:
+    machineconfiguration.openshift.io/role: master
+  name: 99-rollback-test
+spec:
+  config:
+    ignition:
+      version: 2.2.0
+    storage:
+      files:
+      - contents:
+          source: data:,B
+        filesystem: root
+        mode: 420
+        path: /etc/rollback-test
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml
new file mode 100644
index 000000000000..f7ce7f35641c
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml
@@ -0,0 +1,18 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: ssh-bastion
+rules:
+- apiGroups:
+  - "machineconfiguration.openshift.io"
+  resources:
+  - "machineconfigs"
+  verbs:
+  - get
+- apiGroups:
+  - ""
+  resources:
+  - "nodes"
+  verbs:
+  - list
+  - get
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml
new file mode 100644
index 000000000000..cdad0df9e50f
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml
@@ -0,0 +1,14 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  annotations:
+    openshift.io/description: Allows ssh-pod to read nodes and machineconfigs
+  name: ssh-bastion
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: ssh-bastion
+subjects:
+- apiGroup: rbac.authorization.k8s.io
+  kind: User
+  name: system:serviceaccount:ssh-bastion:ssh-bastion
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml
new file mode 100644
index 000000000000..284978607e3f
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml
@@ -0,0 +1,49 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  labels:
+    run: ssh-bastion
+  name: ssh-bastion
+  namespace: ssh-bastion
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      run: ssh-bastion
+  template:
+    metadata:
+      labels:
+        run: ssh-bastion
+    spec:
+      serviceAccountName: "ssh-bastion"
+      containers:
+      - image: quay.io/eparis/ssh:latest
+        imagePullPolicy: Always
+        name: ssh-bastion
+        ports:
+        - containerPort: 22
+          name: ssh
+          protocol: TCP
+        volumeMounts:
+        - name: ssh-host-keys
+          mountPath: "/etc/ssh/"
+          readOnly: true
+        securityContext:
+          privileged: true
+      volumes:
+      - name: ssh-host-keys
+        secret:
+          secretName: ssh-host-keys
+          items:
+          - key: ssh_host_rsa_key
+            path: ssh_host_rsa_key
+            mode: 256
+          - key: ssh_host_ecdsa_key
+            path: ssh_host_ecdsa_key
+            mode: 256
+          - key: ssh_host_ed25519_key
+            path: ssh_host_ed25519_key
+            mode: 256
+          - key: sshd_config
+            path: sshd_config
+      restartPolicy: Always
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml
new file mode 100644
index 000000000000..41fe6775c02c
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml
@@ -0,0 +1,7 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: ssh-bastion
+  labels:
+    openshift.io/run-level: "0"
+
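Note that the file modes in the fixtures above are plain decimal integers, which is what both the Ignition spec and the Kubernetes secret volume API expect: 420 is octal 0644 (the rollback marker file is world-readable) and 256 is octal 0400 (the private host keys stay owner-read-only). A quick shell sanity check:

printf '%o\n' 420   # prints 644
printf '%o\n' 256   # prints 400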
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml
new file mode 100644
index 000000000000..825d93b554ac
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml
@@ -0,0 +1,14 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: ssh-bastion
+  namespace: ssh-bastion
+rules:
+- apiGroups:
+  - security.openshift.io
+  resources:
+  - securitycontextconstraints
+  verbs:
+  - use
+  resourceNames:
+  - privileged
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml
new file mode 100644
index 000000000000..ba2e2f2b4bdb
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml
@@ -0,0 +1,15 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  annotations:
+    openshift.io/description: Allows ssh-pod to run as root
+  name: ssh-bastion
+  namespace: ssh-bastion
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: ssh-bastion
+subjects:
+- apiGroup: rbac.authorization.k8s.io
+  kind: User
+  name: system:serviceaccount:ssh-bastion:ssh-bastion
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml
new file mode 100644
index 000000000000..63fb71775799
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml
@@ -0,0 +1,17 @@
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    run: ssh-bastion
+  name: ssh-bastion
+  namespace: ssh-bastion
+spec:
+  externalTrafficPolicy: Local
+  ports:
+  - name: ssh
+    port: 22
+    protocol: TCP
+    targetPort: ssh
+  selector:
+    run: ssh-bastion
+  type: LoadBalancer
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml
new file mode 100644
index 000000000000..729a2330c7e3
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml
@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: ssh-bastion
+  namespace: ssh-bastion
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config b/test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config
new file mode 100644
index 000000000000..1f1b17167049
--- /dev/null
+++ b/test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config
@@ -0,0 +1,18 @@
+HostKey /etc/ssh/ssh_host_rsa_key
+HostKey /etc/ssh/ssh_host_ecdsa_key
+HostKey /etc/ssh/ssh_host_ed25519_key
+SyslogFacility AUTHPRIV
+PermitRootLogin no
+AuthorizedKeysFile /home/core/.ssh/authorized_keys
+PasswordAuthentication no
+ChallengeResponseAuthentication no
+GSSAPIAuthentication yes
+GSSAPICleanupCredentials no
+UsePAM yes
+X11Forwarding yes
+PrintMotd no
+AcceptEnv LANG LC_CTYPE LC_NUMERIC LC_TIME LC_COLLATE LC_MONETARY LC_MESSAGES
+AcceptEnv LC_PAPER LC_NAME LC_ADDRESS LC_TELEPHONE LC_MEASUREMENT
+AcceptEnv LC_IDENTIFICATION LC_ALL LANGUAGE
+AcceptEnv XMODIFIERS
+Subsystem sftp /usr/libexec/openssh/sftp-server
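The manifests above give the bastion a public LoadBalancer endpoint while the masters keep only private addresses; everything reaches them through an ssh ProxyCommand hop, exactly as restore-etcd.sh does. The same pattern works for manual debugging; a sketch with hypothetical hostnames:

# Hop through the bastion's external address to reach a private master.
ssh -o StrictHostKeyChecking=no \
    -o ProxyCommand="ssh -A -W %h:%p core@bastion.example.com" \
    core@master-0.example.com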
print("Usage: ./update_route_53.py ") + sys.exit(1) + +attempts = 10 +pause = 10 + +domain = sys.argv[1] +record = sys.argv[2] +ip = sys.argv[3] +print("record: %s" % record) +print("ip: %s" % ip) + +client = boto3.client('route53') +r = client.list_hosted_zones_by_name(DNSName=domain, MaxItems="1") +zone_id = r['HostedZones'][0]['Id'].split('/')[-1] + +response = client.change_resource_record_sets( + HostedZoneId=zone_id, + ChangeBatch= { + 'Comment': 'add %s -> %s' % (record, ip), + 'Changes': [ + { + 'Action': 'UPSERT', + 'ResourceRecordSet': { + 'Name': record, + 'Type': 'A', + 'TTL': 60, + 'ResourceRecords': [{'Value': ip}] + } + }] +}) +for i in range(attempts): + print('response: %s' % response) + changeID = response['ChangeInfo']['Id'] + if response['ChangeInfo']['Status'] == "INSYNC": + print('insync found, response: %s' % response) + break + print('waiting for response to complete') + sleep(pause) + response = client.get_change(Id=changeID)