From 442a7869d3bf9d6e5689afcf5dbb6062c42392ef Mon Sep 17 00:00:00 2001
From: Thomas Stromberg
Date: Wed, 25 Mar 2020 12:48:51 -0700
Subject: [PATCH] Re-initialize failed Kubernetes clusters

---
 go.sum                                       |   2 -
 pkg/minikube/bootstrapper/kubeadm/kubeadm.go | 104 +++++++++++++------
 2 files changed, 75 insertions(+), 31 deletions(-)

diff --git a/go.sum b/go.sum
index d7d0bc3a6178..15a7b48dae8a 100644
--- a/go.sum
+++ b/go.sum
@@ -421,8 +421,6 @@ github.com/jmespath/go-jmespath v0.0.0-20160202185014-0b12d6b521d8/go.mod h1:Nht
 github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af h1:pmfjZENx5imkbgOkpRUYLnmbU7UEFbjtDA2hxJ1ichM=
 github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k=
 github.com/joefitzgerald/rainbow-reporter v0.1.0/go.mod h1:481CNgqmVHQZzdIbN52CupLJyoVwB10FQ/IQlF1pdL8=
-github.com/johanneswuerbach/nfsexports v0.0.0-20181204082207-1aa528dcb345 h1:XP1VL9iOZu4yz/rq8zj+yvB23XEY5erXRzp8JYmkWu0=
-github.com/johanneswuerbach/nfsexports v0.0.0-20181204082207-1aa528dcb345/go.mod h1:+c1/kUpg2zlkoWqTOvzDs36Wpbm3Gd1nlmtXAEB0WGU=
 github.com/johanneswuerbach/nfsexports v0.0.0-20200318065542-c48c3734757f h1:tL0xH80QVHQOde6Qqdohv6PewABH8l8N9pywZtuojJ0=
 github.com/johanneswuerbach/nfsexports v0.0.0-20200318065542-c48c3734757f/go.mod h1:+c1/kUpg2zlkoWqTOvzDs36Wpbm3Gd1nlmtXAEB0WGU=
 github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
diff --git a/pkg/minikube/bootstrapper/kubeadm/kubeadm.go b/pkg/minikube/bootstrapper/kubeadm/kubeadm.go
index 5de297a00324..9308439bdde5 100644
--- a/pkg/minikube/bootstrapper/kubeadm/kubeadm.go
+++ b/pkg/minikube/bootstrapper/kubeadm/kubeadm.go
@@ -51,6 +51,7 @@ import (
 	"k8s.io/minikube/pkg/minikube/constants"
 	"k8s.io/minikube/pkg/minikube/cruntime"
 	"k8s.io/minikube/pkg/minikube/driver"
+	"k8s.io/minikube/pkg/minikube/kubelet"
 	"k8s.io/minikube/pkg/minikube/machine"
 	"k8s.io/minikube/pkg/minikube/out"
 	"k8s.io/minikube/pkg/minikube/vmpath"
@@ -129,7 +130,7 @@ func (k *Bootstrapper) LogCommands(cfg config.ClusterConfig, o bootstrapper.LogO
 		dmesg.WriteString(fmt.Sprintf(" | tail -n %d", o.Lines))
 	}
 
-	describeNodes := fmt.Sprintf("sudo %s describe node -A --kubeconfig=%s",
+	describeNodes := fmt.Sprintf("sudo %s describe nodes --kubeconfig=%s",
 		path.Join(vmpath.GuestPersistentDir, "binaries", cfg.KubernetesConfig.KubernetesVersion, "kubectl"),
 		path.Join(vmpath.GuestPersistentDir, "kubeconfig"))
 
@@ -181,20 +182,7 @@ func (k *Bootstrapper) clearStaleConfigs(cfg config.ClusterConfig) error {
 	return nil
 }
-// StartCluster starts the cluster
-func (k *Bootstrapper) StartCluster(cfg config.ClusterConfig) error {
-	err := bsutil.ExistingConfig(k.c)
-	if err == nil { // if there is an existing cluster don't reconfigure it
-		return k.restartCluster(cfg)
-	}
-	glog.Infof("existence check: %v", err)
-
-	start := time.Now()
-	glog.Infof("StartCluster: %+v", cfg)
-	defer func() {
-		glog.Infof("StartCluster complete in %s", time.Since(start))
-	}()
-
+func (k *Bootstrapper) init(cfg config.ClusterConfig) error {
 	version, err := util.ParseKubernetesVersion(cfg.KubernetesConfig.KubernetesVersion)
 	if err != nil {
 		return errors.Wrap(err, "parsing kubernetes version")
 	}
@@ -237,10 +225,10 @@ func (k *Bootstrapper) StartCluster(cfg config.ClusterConfig) error {
 	}
 
 	conf := bsutil.KubeadmYamlPath
-	c := exec.Command("/bin/bash", "-c", fmt.Sprintf("sudo mv %s.new %s && %s init --config %s %s --ignore-preflight-errors=%s", conf, conf, bsutil.InvokeKubeadm(cfg.KubernetesConfig.KubernetesVersion), conf, extraFlags, strings.Join(ignore, ",")))
-	rr, err := k.c.RunCmd(c)
-	if err != nil {
-		return errors.Wrapf(err, "init failed. output: %q", rr.Output())
+	c := exec.Command("/bin/bash", "-c", fmt.Sprintf("%s init --config %s %s --ignore-preflight-errors=%s",
+		bsutil.InvokeKubeadm(cfg.KubernetesConfig.KubernetesVersion), conf, extraFlags, strings.Join(ignore, ",")))
+	if _, err := k.c.RunCmd(c); err != nil {
+		return errors.Wrap(err, "run")
 	}
 
 	if cfg.Driver == driver.Docker {
@@ -258,12 +246,49 @@ func (k *Bootstrapper) StartCluster(cfg config.ClusterConfig) error {
 	}
 
 	if err := k.elevateKubeSystemPrivileges(cfg); err != nil {
-		glog.Warningf("unable to create cluster role binding, some addons might not work : %v. ", err)
+		glog.Warningf("unable to create cluster role binding, some addons might not work: %v", err)
 	}
-
 	return nil
 }
 
+// StartCluster starts the cluster
+func (k *Bootstrapper) StartCluster(cfg config.ClusterConfig) error {
+	start := time.Now()
+	glog.Infof("StartCluster: %+v", cfg)
+	defer func() {
+		glog.Infof("StartCluster complete in %s", time.Since(start))
+	}()
+
+	if err := bsutil.ExistingConfig(k.c); err == nil {
+		glog.Infof("found existing configuration files, will attempt cluster restart")
+		rerr := k.restartCluster(cfg)
+		if rerr == nil {
+			return nil
+		}
+		out.T(out.Embarrassed, "Unable to restart cluster, will reset it: {{.error}}", out.V{"error": rerr})
+		if err := k.DeleteCluster(cfg.KubernetesConfig); err != nil {
+			glog.Warningf("delete failed: %v", err)
+		}
+		// Fall-through to init
+	}
+
+	conf := bsutil.KubeadmYamlPath
+	if _, err := k.c.RunCmd(exec.Command("sudo", "cp", conf+".new", conf)); err != nil {
+		return errors.Wrap(err, "cp")
+	}
+
+	err := k.init(cfg)
+	if err == nil {
+		return nil
+	}
+
+	out.T(out.Conflict, "Initialization failed, will try again: {{.error}}", out.V{"error": err})
+	if err := k.DeleteCluster(cfg.KubernetesConfig); err != nil {
+		glog.Warningf("delete failed: %v", err)
+	}
+	return k.init(cfg)
+}
+
 func (k *Bootstrapper) controlPlaneEndpoint(cfg config.ClusterConfig) (string, int, error) {
 	cp, err := config.PrimaryControlPlane(&cfg)
 	if err != nil {
@@ -410,8 +435,8 @@ func (k *Bootstrapper) restartCluster(cfg config.ClusterConfig) error {
 		return errors.Wrap(err, "clearing stale configs")
 	}
 
-	if _, err := k.c.RunCmd(exec.Command("sudo", "mv", conf+".new", conf)); err != nil {
-		return errors.Wrap(err, "mv")
+	if _, err := k.c.RunCmd(exec.Command("sudo", "cp", conf+".new", conf)); err != nil {
+		return errors.Wrap(err, "cp")
 	}
 
 	baseCmd := fmt.Sprintf("%s %s", bsutil.InvokeKubeadm(cfg.KubernetesConfig.KubernetesVersion), phase)
@@ -425,9 +450,9 @@ func (k *Bootstrapper) restartCluster(cfg config.ClusterConfig) error {
 	glog.Infof("resetting cluster from %s", conf)
 	// Run commands one at a time so that it is easier to root cause failures.
 	for _, c := range cmds {
-		rr, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", c))
+		_, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", c))
 		if err != nil {
-			return errors.Wrapf(err, "running cmd: %s", rr.Command())
+			return errors.Wrap(err, "run")
 		}
 	}
 
@@ -504,11 +529,32 @@ func (k *Bootstrapper) DeleteCluster(k8s config.KubernetesConfig) error {
 		cmd = fmt.Sprintf("%s reset", bsutil.InvokeKubeadm(k8s.KubernetesVersion))
 	}
 
-	if rr, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", cmd)); err != nil {
-		return errors.Wrapf(err, "kubeadm reset: cmd: %q", rr.Command())
+	rr, derr := k.c.RunCmd(exec.Command("/bin/bash", "-c", cmd))
+	if derr != nil {
+		glog.Warningf("%s: %v", rr.Command(), derr)
 	}
 
-	return nil
+	if err := kubelet.ForceStop(k.c); err != nil {
+		glog.Warningf("stop kubelet: %v", err)
+	}
+
+	cr, err := cruntime.New(cruntime.Config{Type: k8s.ContainerRuntime, Runner: k.c, Socket: k8s.CRISocket})
+	if err != nil {
+		return errors.Wrap(err, "runtime")
+	}
+
+	containers, err := cr.ListContainers(cruntime.ListOptions{Namespaces: []string{"kube-system"}})
+	if err != nil {
+		glog.Warningf("unable to list kube-system containers: %v", err)
+	}
+	if len(containers) > 0 {
+		glog.Warningf("found %d kube-system containers to stop", len(containers))
+		if err := cr.StopContainers(containers); err != nil {
+			glog.Warningf("error stopping containers: %v", err)
+		}
+	}
+
+	return derr
 }
 
 // SetupCerts sets up certificates within the cluster.
@@ -619,7 +665,7 @@ func reloadKubelet(runner command.Runner) error {
 		return nil
 	}
 
-	startCmd := exec.Command("/bin/bash", "-c", fmt.Sprintf("sudo mv %s.new %s && sudo mv %s.new %s && sudo systemctl daemon-reload && sudo systemctl restart kubelet", svc, svc, conf, conf))
+	startCmd := exec.Command("/bin/bash", "-c", fmt.Sprintf("sudo cp %s.new %s && sudo cp %s.new %s && sudo systemctl daemon-reload && sudo systemctl restart kubelet", svc, svc, conf, conf))
 	if _, err := runner.RunCmd(startCmd); err != nil {
 		return errors.Wrap(err, "starting kubelet")
 	}
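
Reviewer note: the new StartCluster above reduces to a restart-then-reset-then-retry flow. The sketch below is a minimal, self-contained illustration of that control flow only; it assumes nothing about minikube internals, and hasExistingConfig, restart, reset and initialize are hypothetical stand-ins for bsutil.ExistingConfig, restartCluster, DeleteCluster and init rather than real APIs.

package main

import (
	"errors"
	"fmt"
	"log"
)

// startCluster mirrors the fall-through logic in the patched StartCluster:
// if prior configuration exists, try a restart; on failure (or when there is
// no prior state), reset and initialize, retrying the initialization once.
func startCluster(hasExistingConfig bool, restart, reset, initialize func() error) error {
	if hasExistingConfig {
		rerr := restart()
		if rerr == nil {
			return nil
		}
		log.Printf("unable to restart cluster, will reset it: %v", rerr)
		if derr := reset(); derr != nil {
			log.Printf("delete failed: %v", derr)
		}
		// fall through to a fresh init
	}

	ierr := initialize()
	if ierr == nil {
		return nil
	}
	log.Printf("initialization failed, will try again: %v", ierr)
	if derr := reset(); derr != nil {
		log.Printf("delete failed: %v", derr)
	}
	return initialize()
}

func main() {
	attempts := 0
	err := startCluster(
		true,
		func() error { return errors.New("kubelet not healthy") }, // restart fails
		func() error { return nil },                               // reset succeeds
		func() error { // first init fails, the retry succeeds
			attempts++
			if attempts == 1 {
				return errors.New("preflight check failed")
			}
			return nil
		},
	)
	fmt.Println("startCluster returned:", err) // prints: startCluster returned: <nil>
}

As in the patch, a second init failure is returned to the caller rather than retried again: the reset/retry happens exactly once.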