diff --git a/pkg/addons/addons.go b/pkg/addons/addons.go
index b39ee3f60174..f394fc0681b4 100644
--- a/pkg/addons/addons.go
+++ b/pkg/addons/addons.go
@@ -332,7 +332,9 @@ func Start(wg *sync.WaitGroup, cc *config.ClusterConfig, toEnable map[string]boo
 
 	var awg sync.WaitGroup
 
-	out.T(out.AddonEnable, "Enabling addons: {{.addons}}", out.V{"addons": strings.Join(toEnableList, ", ")})
+	defer func() { // making it show after verifications (not perfect until #7613 is closed)
+		out.T(out.AddonEnable, "Enabled addons: {{.addons}}", out.V{"addons": strings.Join(toEnableList, ", ")})
+	}()
 	for _, a := range toEnableList {
 		awg.Add(1)
 		go func(name string) {
diff --git a/pkg/minikube/bootstrapper/bsutil/kverify/kverify.go b/pkg/minikube/bootstrapper/bsutil/kverify/kverify.go
index f4486196c349..0bc234c7011f 100644
--- a/pkg/minikube/bootstrapper/bsutil/kverify/kverify.go
+++ b/pkg/minikube/bootstrapper/bsutil/kverify/kverify.go
@@ -32,7 +32,9 @@ const (
 	// DefaultSAWaitKey is the name used in the flags for default service account
 	DefaultSAWaitKey = "default_sa"
-	// AppsRunning is the name used in the flags for waiting for k8s-apps to be running
-	AppsRunning = "apps_running"
+	// AppsRunningKey is the name used in the flags for waiting for k8s-apps to be running
+	AppsRunningKey = "apps_running"
+	// NodeReadyKey is the name used in the flags for waiting for the node status to be ready
+	NodeReadyKey = "node_ready"
 )
 
 // vars related to the --wait flag
@@ -40,13 +42,13 @@ var (
 	// DefaultComponents is map of the default components to wait for
 	DefaultComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true}
 	// NoComponents is map of components to wait for if 'none' or 'false' is specified
-	NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunning: false}
+	NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunningKey: false, NodeReadyKey: false}
 	// AllComponents is map for waiting for all components.
-	AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunning: true}
+	AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunningKey: true, NodeReadyKey: true}
 	// DefaultWaitList is the list of all default components to wait for. Only names to be used for start flags.
 	DefaultWaitList = []string{APIServerWaitKey, SystemPodsWaitKey}
 	// AllComponentsList is the list of all valid component keys to wait for. Only names to be used for start flags.
-	AllComponentsList = []string{APIServerWaitKey, SystemPodsWaitKey, DefaultSAWaitKey, AppsRunning}
+	AllComponentsList = []string{APIServerWaitKey, SystemPodsWaitKey, DefaultSAWaitKey, AppsRunningKey, NodeReadyKey}
 	// AppsRunningList is the list of valid k8s-app components to wait on to be running
 	AppsRunningList = []string{
 		"kube-dns", // coredns
diff --git a/pkg/minikube/bootstrapper/bsutil/kverify/node_conditions.go b/pkg/minikube/bootstrapper/bsutil/kverify/node_conditions.go
new file mode 100644
index 000000000000..7e8e9a40a5c8
--- /dev/null
+++ b/pkg/minikube/bootstrapper/bsutil/kverify/node_conditions.go
@@ -0,0 +1,142 @@
+/*
+Copyright 2020 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package kverify verifies a running kubernetes cluster is healthy
+package kverify
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/golang/glog"
+	"github.com/pkg/errors"
+	v1 "k8s.io/api/core/v1"
+	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+)
+
+// NodeCondition represents a favorable or unfavorable node condition.
+type NodeCondition struct {
+	Type    v1.NodeConditionType
+	Status  v1.ConditionStatus
+	Reason  string
+	Message string
+}
+
+// DiskPressure detects if the condition is disk pressure
+func (pc *NodeCondition) DiskPressure() bool {
+	return pc.Type == v1.NodeDiskPressure && pc.Status == v1.ConditionTrue
+}
+
+// MemoryPressure detects if the condition is memory pressure
+func (pc *NodeCondition) MemoryPressure() bool {
+	return pc.Type == v1.NodeMemoryPressure && pc.Status == v1.ConditionTrue
+}
+
+// PIDPressure detects if the condition is PID pressure
+func (pc *NodeCondition) PIDPressure() bool {
+	return pc.Type == v1.NodePIDPressure && pc.Status == v1.ConditionTrue
+}
+
+// NetworkUnavailable detects if the condition is network unavailable
+func (pc *NodeCondition) NetworkUnavailable() bool {
+	return pc.Type == v1.NodeNetworkUnavailable && pc.Status == v1.ConditionTrue
+}
+
+const errTextFormat = "node has unwanted condition %q: Reason %q Message: %q"
+
+// ErrMemoryPressure is thrown when there is a node memory pressure condition
+type ErrMemoryPressure struct {
+	NodeCondition
+}
+
+func (e *ErrMemoryPressure) Error() string {
+	return fmt.Sprintf(errTextFormat, e.Type, e.Reason, e.Message)
+}
+
+// ErrDiskPressure is thrown when there is a node disk pressure condition
+type ErrDiskPressure struct {
+	NodeCondition
+}
+
+func (e *ErrDiskPressure) Error() string {
+	return fmt.Sprintf(errTextFormat, e.Type, e.Reason, e.Message)
+}
+
+// ErrPIDPressure is thrown when there is a node PID pressure condition
+type ErrPIDPressure struct {
+	NodeCondition
+}
+
+func (e *ErrPIDPressure) Error() string {
+	return fmt.Sprintf(errTextFormat, e.Type, e.Reason, e.Message)
+}
+
+// ErrNetworkNotReady is thrown when the node network is not ready
+type ErrNetworkNotReady struct {
+	NodeCondition
+}
+
+func (e *ErrNetworkNotReady) Error() string {
+	return fmt.Sprintf(errTextFormat, e.Type, e.Reason, e.Message)
+}
+
+// NodePressure verifies that the node is not under disk, memory, PID or network pressure.
+func NodePressure(cs *kubernetes.Clientset) error {
+	glog.Info("verifying NodePressure condition ...")
+	start := time.Now()
+	defer func() {
+		glog.Infof("duration metric: took %s to run NodePressure ...", time.Since(start))
+	}()
+
+	ns, err := cs.CoreV1().Nodes().List(meta.ListOptions{})
+	if err != nil {
+		return errors.Wrap(err, "list nodes")
+	}
+
+	for _, n := range ns.Items {
+		glog.Infof("node storage ephemeral capacity is %s", n.Status.Capacity.StorageEphemeral())
+		glog.Infof("node cpu capacity is %s", n.Status.Capacity.Cpu().AsDec())
+		for _, c := range n.Status.Conditions {
+			pc := NodeCondition{Type: c.Type, Status: c.Status, Reason: c.Reason, Message: c.Message}
+			if pc.DiskPressure() {
+				return &ErrDiskPressure{
+					NodeCondition: pc,
+				}
+			}
+
+			if pc.MemoryPressure() {
+				return &ErrMemoryPressure{
+					NodeCondition: pc,
+				}
+			}
+
+			if pc.PIDPressure() {
+				return &ErrPIDPressure{
+					NodeCondition: pc,
+				}
+			}
+
+			if pc.NetworkUnavailable() {
+				return &ErrNetworkNotReady{
+					NodeCondition: pc,
+				}
+			}
+		}
+	}
+	return nil
+}
diff --git a/pkg/minikube/bootstrapper/bsutil/kverify/node_ready.go b/pkg/minikube/bootstrapper/bsutil/kverify/node_ready.go
new file mode 100644
index 000000000000..a9c1879b6a01
--- /dev/null
+++ b/pkg/minikube/bootstrapper/bsutil/kverify/node_ready.go
@@ -0,0 +1,64 @@
+/*
+Copyright 2020 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package kverify verifies a running kubernetes cluster is healthy
+package kverify
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/golang/glog"
+	"github.com/pkg/errors"
+	v1 "k8s.io/api/core/v1"
+	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/client-go/kubernetes"
+	kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants"
+)
+
+// WaitForNodeReady waits till kube client reports node status as "ready"
+func WaitForNodeReady(cs *kubernetes.Clientset, timeout time.Duration) error {
+	glog.Info("waiting for node status to be ready ...")
+	start := time.Now()
+	defer func() {
+		glog.Infof("duration metric: took %s to run WaitForNodeReady ...", time.Since(start))
+	}()
+	checkReady := func() (bool, error) {
+		if time.Since(start) > timeout {
+			return false, fmt.Errorf("wait for node to be ready timed out")
+		}
+		ns, err := cs.CoreV1().Nodes().List(meta.ListOptions{})
+		if err != nil {
+			glog.Infof("error listing nodes, will retry: %v", err)
+			return false, nil
+		}
+
+		for _, n := range ns.Items {
+			for _, c := range n.Status.Conditions {
+				if c.Type == v1.NodeReady && c.Status != v1.ConditionTrue {
+					glog.Infof("node %q has unwanted condition %q: Reason %q Message: %q. will retry.", n.Name, c.Type, c.Reason, c.Message)
", n.Name, c.Type, c.Reason, c.Message) + return false, nil + } + } + } + return true, nil + } + if err := wait.PollImmediate(kconst.APICallRetryInterval, kconst.DefaultControlPlaneTimeout, checkReady); err != nil { + return errors.Wrapf(err, "wait node ready") + } + return nil +} diff --git a/pkg/minikube/bootstrapper/kubeadm/kubeadm.go b/pkg/minikube/bootstrapper/kubeadm/kubeadm.go index 3ae20295e467..625e580fbae9 100644 --- a/pkg/minikube/bootstrapper/kubeadm/kubeadm.go +++ b/pkg/minikube/bootstrapper/kubeadm/kubeadm.go @@ -21,6 +21,7 @@ import ( "context" "os/exec" "path" + "runtime" "sync" "fmt" @@ -41,6 +42,7 @@ import ( "k8s.io/client-go/kubernetes" kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants" "k8s.io/minikube/pkg/drivers/kic" + "k8s.io/minikube/pkg/drivers/kic/oci" "k8s.io/minikube/pkg/kapi" "k8s.io/minikube/pkg/minikube/assets" "k8s.io/minikube/pkg/minikube/bootstrapper" @@ -325,16 +327,37 @@ func (k *Bootstrapper) client(ip string, port int) (*kubernetes.Clientset, error } // WaitForNode blocks until the node appears to be healthy -func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, timeout time.Duration) error { +func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, timeout time.Duration) (waitErr error) { start := time.Now() if !n.ControlPlane { glog.Infof("%s is not a control plane, nothing to wait for", n.Name) return nil } + + out.T(out.HealthCheck, "Verifying Kubernetes Components:") + out.T(out.IndentVerify, "verifying node conditions ...") + + // TODO: #7706: for better performance we could use k.client inside minikube to avoid asking for external IP:PORT + hostname, _, port, err := driver.ControlPaneEndpoint(&cfg, &n, cfg.Driver) + if err != nil { + return errors.Wrap(err, "get control plane endpoint") + } + + defer func() { // run pressure verification after all other checks, so there be an api server to talk to. 
+		client, err := k.client(hostname, port)
+		if err != nil {
+			waitErr = errors.Wrap(err, "get k8s client")
+			return
+		}
+		if err := kverify.NodePressure(client); err != nil {
+			adviseNodePressure(err, cfg.Name, cfg.Driver)
+			waitErr = errors.Wrap(err, "node pressure")
+		}
+	}()
+
 	if !kverify.ShouldWait(cfg.VerifyComponents) {
 		glog.Infof("skip waiting for components based on config.")
-		return nil
+		return waitErr
 	}
 
 	cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
@@ -342,12 +365,8 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
 		return errors.Wrapf(err, "create runtime-manager %s", cfg.KubernetesConfig.ContainerRuntime)
 	}
 
-	hostname, _, port, err := driver.ControlPaneEndpoint(&cfg, &n, cfg.Driver)
-	if err != nil {
-		return errors.Wrap(err, "get control plane endpoint")
-	}
-
 	if cfg.VerifyComponents[kverify.APIServerWaitKey] {
+		out.T(out.IndentVerify, "verifying api server ...")
 		client, err := k.client(hostname, port)
 		if err != nil {
 			return errors.Wrap(err, "get k8s client")
@@ -362,6 +381,7 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
 	}
 
 	if cfg.VerifyComponents[kverify.SystemPodsWaitKey] {
+		out.T(out.IndentVerify, "verifying system pods ...")
 		client, err := k.client(hostname, port)
 		if err != nil {
 			return errors.Wrap(err, "get k8s client")
@@ -372,6 +392,7 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
 	}
 
 	if cfg.VerifyComponents[kverify.DefaultSAWaitKey] {
+		out.T(out.IndentVerify, "verifying default service account ...")
 		client, err := k.client(hostname, port)
 		if err != nil {
 			return errors.Wrap(err, "get k8s client")
@@ -381,7 +402,8 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
 		}
 	}
 
-	if cfg.VerifyComponents[kverify.AppsRunning] {
+	if cfg.VerifyComponents[kverify.AppsRunningKey] {
+		out.T(out.IndentVerify, "verifying apps running ...")
 		client, err := k.client(hostname, port)
 		if err != nil {
 			return errors.Wrap(err, "get k8s client")
@@ -391,8 +413,19 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
 		}
 	}
 
+	if cfg.VerifyComponents[kverify.NodeReadyKey] {
+		out.T(out.IndentVerify, "verifying node ready ...")
+		client, err := k.client(hostname, port)
+		if err != nil {
+			return errors.Wrap(err, "get k8s client")
+		}
+		if err := kverify.WaitForNodeReady(client, timeout); err != nil {
+			return errors.Wrap(err, "waiting for node to be ready")
+		}
+	}
+
 	glog.Infof("duration metric: took %s to wait for: %+v ...", time.Since(start), cfg.VerifyComponents)
-	return nil
+	return waitErr
 }
 
 // needsReset returns whether or not the cluster needs to be reconfigured
@@ -517,6 +550,10 @@ func (k *Bootstrapper) restartCluster(cfg config.ClusterConfig) error {
 		return errors.Wrap(err, "system pods")
 	}
 
+	if err := kverify.NodePressure(client); err != nil {
+		adviseNodePressure(err, cfg.Name, cfg.Driver)
+	}
+
 	// This can fail during upgrades if the old pods have not shut down yet
 	addonPhase := func() error {
 		_, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", fmt.Sprintf("%s phase addon all --config %s", baseCmd, conf)))
@@ -860,3 +897,63 @@ func (k *Bootstrapper) elevateKubeSystemPrivileges(cfg config.ClusterConfig) err
 	}
 	return nil
 }
+
+// adviseNodePressure will advise the user what to do with different pressure errors based on their environment
+func adviseNodePressure(err error, name string, drv string) {
+	if diskErr, ok := err.(*kverify.ErrDiskPressure); ok {
+		out.ErrLn("")
+		glog.Warning(diskErr)
+		out.WarningT("The node {{.name}} has run out of disk space.", out.V{"name": name})
+		// generic advice for all drivers
+		out.T(out.Tip, "Please free up disk space or prune images.")
+		if driver.IsVM(drv) {
+			out.T(out.Stopped, "Please create a cluster with a bigger disk size: `minikube start --disk-size SIZE_MB`")
+		} else if drv == oci.Docker && runtime.GOOS != "linux" {
+			out.T(out.Stopped, "Please increase Docker Desktop's disk size.")
+			if runtime.GOOS == "darwin" {
+				out.T(out.Documentation, "Documentation: {{.url}}", out.V{"url": "https://docs.docker.com/docker-for-mac/space/"})
+			}
+			if runtime.GOOS == "windows" {
+				out.T(out.Documentation, "Documentation: {{.url}}", out.V{"url": "https://docs.docker.com/docker-for-windows/"})
+			}
+		}
+		out.ErrLn("")
+		return
+	}
+
+	if memErr, ok := err.(*kverify.ErrMemoryPressure); ok {
+		out.ErrLn("")
+		glog.Warning(memErr)
+		out.WarningT("The node {{.name}} has run out of memory.", out.V{"name": name})
+		out.T(out.Tip, "Check if you have unnecessary pods running with 'kubectl get po -A'")
+		if driver.IsVM(drv) {
+			out.T(out.Stopped, "Consider creating a cluster with a larger memory size using `minikube start --memory SIZE_MB`")
+		} else if drv == oci.Docker && runtime.GOOS != "linux" {
+			out.T(out.Stopped, "Consider increasing Docker Desktop's memory size.")
+			if runtime.GOOS == "darwin" {
+				out.T(out.Documentation, "Documentation: {{.url}}", out.V{"url": "https://docs.docker.com/docker-for-mac/space/"})
+			}
+			if runtime.GOOS == "windows" {
+				out.T(out.Documentation, "Documentation: {{.url}}", out.V{"url": "https://docs.docker.com/docker-for-windows/"})
+			}
+		}
+		out.ErrLn("")
+		return
+	}
+
+	if pidErr, ok := err.(*kverify.ErrPIDPressure); ok {
+		glog.Warning(pidErr)
+		out.ErrLn("")
+		out.WarningT("The node {{.name}} has run out of available PIDs.", out.V{"name": name})
+		out.ErrLn("")
+		return
+	}
+
+	if netErr, ok := err.(*kverify.ErrNetworkNotReady); ok {
+		glog.Warning(netErr)
+		out.ErrLn("")
+		out.WarningT("The node {{.name}} network is not available. Please verify network settings.", out.V{"name": name})
Please verify network settings.", out.V{"name": name}) + out.ErrLn("") + return + } +} diff --git a/pkg/minikube/node/start.go b/pkg/minikube/node/start.go index 689adef274b5..004ce8923448 100644 --- a/pkg/minikube/node/start.go +++ b/pkg/minikube/node/start.go @@ -37,7 +37,6 @@ import ( cmdcfg "k8s.io/minikube/cmd/minikube/cmd/config" "k8s.io/minikube/pkg/addons" "k8s.io/minikube/pkg/minikube/bootstrapper" - "k8s.io/minikube/pkg/minikube/bootstrapper/bsutil/kverify" "k8s.io/minikube/pkg/minikube/bootstrapper/images" "k8s.io/minikube/pkg/minikube/cluster" "k8s.io/minikube/pkg/minikube/command" @@ -145,8 +144,8 @@ func Start(starter Starter, apiServer bool) (*kubeconfig.Settings, error) { prepareNone() } - // Skip pre-existing, because we already waited for health - if kverify.ShouldWait(starter.Cfg.VerifyComponents) && !starter.PreExists { + // TODO: existing cluster should wait for health #7597 + if !starter.PreExists { if err := bs.WaitForNode(*starter.Cfg, *starter.Node, viper.GetDuration(waitTimeout)); err != nil { return nil, errors.Wrap(err, "Wait failed") } diff --git a/pkg/minikube/out/style.go b/pkg/minikube/out/style.go index 1cd400d2ae24..29629db19138 100644 --- a/pkg/minikube/out/style.go +++ b/pkg/minikube/out/style.go @@ -70,6 +70,7 @@ var styles = map[StyleEnum]style{ ThumbsUp: {Prefix: "👍 "}, ThumbsDown: {Prefix: "👎 "}, Option: {Prefix: " ▪ ", LowPrefix: lowIndent}, // Indented bullet + IndentVerify: {Prefix: " 🔎 ", LowPrefix: lowIndent}, // Indented verifying icon, it needs extra space to make it work Command: {Prefix: " ▪ ", LowPrefix: lowIndent}, // Indented bullet LogEntry: {Prefix: " "}, // Indent Deleted: {Prefix: "💀 "}, @@ -108,6 +109,7 @@ var styles = map[StyleEnum]style{ Enabling: {Prefix: "🔌 "}, Shutdown: {Prefix: "🛑 "}, Pulling: {Prefix: "🚜 "}, + HealthCheck: {Prefix: "🕵️ "}, // mac needed extra space for right tabbing Verifying: {Prefix: "🤔 "}, VerifyingNoLine: {Prefix: "🤔 ", OmitNewline: true}, Kubectl: {Prefix: "💗 "}, diff --git a/pkg/minikube/out/style_enum.go b/pkg/minikube/out/style_enum.go index 1437b26823a9..e47ec2572254 100644 --- a/pkg/minikube/out/style_enum.go +++ b/pkg/minikube/out/style_enum.go @@ -43,6 +43,7 @@ const ( ThumbsUp ThumbsDown Option + IndentVerify Command LogEntry Deleted @@ -73,6 +74,7 @@ const ( Enabling Shutdown Pulling + HealthCheck Verifying VerifyingNoLine Kubectl diff --git a/site/content/en/docs/commands/start.md b/site/content/en/docs/commands/start.md index 839be1e1673d..60128f2f653c 100644 --- a/site/content/en/docs/commands/start.md +++ b/site/content/en/docs/commands/start.md @@ -86,7 +86,7 @@ minikube start [flags] --uuid string Provide VM UUID to restore MAC address (hyperkit driver only) --vm Filter to use only VM Drivers --vm-driver driver DEPRECATED, use driver instead. - --wait strings comma separated list of kubernetes components to verify and wait for after starting a cluster. defaults to "apiserver,system_pods", available options: "apiserver,system_pods,default_sa,apps_running" . other acceptable values are 'all' or 'none', 'true' and 'false' (default [apiserver,system_pods]) + --wait strings comma separated list of kubernetes components to verify and wait for after starting a cluster. defaults to "apiserver,system_pods", available options: "apiserver,system_pods,default_sa,apps_running,node_ready" . other acceptable values are 'all' or 'none', 'true' and 'false' (default [apiserver,system_pods]) --wait-timeout duration max time to wait per Kubernetes core services to be healthy. (default 6m0s) ```