diff --git a/pkg/minikube/bootstrapper/bsutil/kverify/kverify.go b/pkg/minikube/bootstrapper/bsutil/kverify/kverify.go
index e12f2d707420..a9f9ec6638da 100644
--- a/pkg/minikube/bootstrapper/bsutil/kverify/kverify.go
+++ b/pkg/minikube/bootstrapper/bsutil/kverify/kverify.go
@@ -37,6 +37,8 @@ const (
NodeReadyKey = "node_ready"
// KubeletKey is the name used in the flags for waiting for the kubelet status to be ready
KubeletKey = "kubelet"
+ // ExtraKey is the name used for extra waiting for pods in CorePodsList to be Ready
+ ExtraKey = "extra"
)

// vars related to the --wait flag
@@ -44,9 +46,9 @@ var (
// DefaultComponents is map of the the default components to wait for
DefaultComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true}
// NoWaitComponents is map of componets to wait for if specified 'none' or 'false'
- NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunningKey: false, NodeReadyKey: false, KubeletKey: false}
+ NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunningKey: false, NodeReadyKey: false, KubeletKey: false, ExtraKey: false}
// AllComponents is map for waiting for all components.
- AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunningKey: true, NodeReadyKey: true, KubeletKey: true}
+ AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunningKey: true, NodeReadyKey: true, KubeletKey: true, ExtraKey: true}
// DefaultWaitList is list of all default components to wait for. only names to be used for start flags.
DefaultWaitList = []string{APIServerWaitKey, SystemPodsWaitKey}
// AllComponentsList list of all valid components keys to wait for. only names to be used used for start flags.
@@ -60,6 +62,15 @@ var (
"kube-proxy",
"kube-scheduler",
}
+ // CorePodsList is a list of essential Kubernetes pods that the extra wait checks to be Ready
+ CorePodsList = []string{
+ "kube-dns", // coredns
+ "etcd",
+ "kube-apiserver",
+ "kube-controller-manager",
+ "kube-proxy",
+ "kube-scheduler",
+ }
)

// ShouldWait will return true if the config says need to wait
diff --git a/pkg/minikube/bootstrapper/bsutil/kverify/pod_ready.go b/pkg/minikube/bootstrapper/bsutil/kverify/pod_ready.go
new file mode 100644
index 000000000000..982cee5461dc
--- /dev/null
+++ b/pkg/minikube/bootstrapper/bsutil/kverify/pod_ready.go
@@ -0,0 +1,133 @@
+/*
+Copyright 2021 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package kverify verifies a running Kubernetes cluster is healthy
+package kverify
+
+import (
+ "fmt"
+ "strings"
+ "time"
+
+ "github.com/pkg/errors"
+ core "k8s.io/api/core/v1"
+ meta "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/util/wait"
+ "k8s.io/client-go/kubernetes"
+ "k8s.io/klog/v2"
+ kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants"
+)
+
+// WaitExtra calls waitForPodReadyByLabel for each label in the labels list and returns any errors that occurred.
+func WaitExtra(cs *kubernetes.Clientset, labels []string, timeout time.Duration) error {
+ klog.Infof("extra waiting for kube-system core pods %s to be Ready ...", labels)
+ start := time.Now()
+ defer func() {
+ klog.Infof("duration metric: took %s for extra waiting for kube-system core pods to be Ready ...", time.Since(start))
+ }()
+
+ var errs []string
+ for _, label := range labels {
+ if err := waitForPodReadyByLabel(cs, label, "kube-system", timeout); err != nil {
+ errs = append(errs, fmt.Sprintf("%q: %q", label, err.Error()))
+ }
+ }
+ if errs != nil {
+ return fmt.Errorf(strings.Join(errs, ", "))
+ }
+
+ return nil
+}
+
+// waitForPodReadyByLabel waits for a pod with label ([key:]val) in a namespace to be in the Ready condition.
+// If namespace is not provided, it defaults to "kube-system".
+// If label key is not provided, it will try with "component" and "k8s-app".
+func waitForPodReadyByLabel(cs *kubernetes.Clientset, label, namespace string, timeout time.Duration) error {
+ klog.Infof("waiting %v for pod with %q label in %q namespace to be Ready ...", timeout, label, namespace)
+ start := time.Now()
+ defer func() {
+ klog.Infof("duration metric: took %v to run waitForPodReadyByLabel for pod with %q label in %q namespace ...", time.Since(start), label, namespace)
+ }()
+
+ if namespace == "" {
+ namespace = "kube-system"
+ }
+
+ lkey := ""
+ lval := ""
+ l := strings.Split(label, ":")
+ switch len(l) {
+ case 1: // treat as no label key provided, just val
+ lval = strings.TrimSpace(l[0])
+ case 2:
+ lkey = strings.TrimSpace(l[0])
+ lval = strings.TrimSpace(l[1])
+ default:
+ return fmt.Errorf("pod label %q is malformed", label)
+ }
+
+ lap := time.Now()
+ checkReady := func() (bool, error) {
+ if time.Since(start) > timeout {
+ return false, fmt.Errorf("wait for pod with %q label in %q namespace to be Ready timed out", label, namespace)
+ }
+ pods, err := cs.CoreV1().Pods(namespace).List(meta.ListOptions{})
+ if err != nil {
+ klog.Infof("error listing pods in %q namespace, will retry: %v", namespace, err)
+ return false, nil
+ }
+ for _, pod := range pods.Items {
+ for k, v := range pod.ObjectMeta.Labels {
+ if ((lkey == "" && (k == "component" || k == "k8s-app")) || lkey == k) && v == lval {
+ ready, reason := IsPodReady(&pod)
+ if ready {
+ klog.Info(reason)
+ return true, nil
+ }
+ // reduce log spam
+ if time.Since(lap) > (1 * time.Second) {
+ klog.Info(reason)
+ lap = time.Now()
+ }
+ return false, nil
+ }
+ }
+ }
+ klog.Infof("pod with %q label in %q namespace was not found, will retry", label, namespace)
+ return false, nil
+ }
+ if err := wait.PollImmediate(kconst.APICallRetryInterval, kconst.DefaultControlPlaneTimeout, checkReady); err != nil {
+ return errors.Wrapf(err, "wait pod Ready")
+ }
+
+ return nil
+}
+
+// IsPodReady returns whether the pod is Ready and a verbose reason.
+func IsPodReady(pod *core.Pod) (ready bool, reason string) {
+ if pod.Status.Phase != core.PodRunning {
+ return false, fmt.Sprintf("pod %q in %q namespace is not Running: %+v", pod.Name, pod.Namespace, pod.Status)
+ }
+ for _, c := range pod.Status.Conditions {
+ if c.Type == core.PodReady {
+ if c.Status != core.ConditionTrue {
+ return false, fmt.Sprintf("pod %q in %q namespace is not Ready: %+v", pod.Name, pod.Namespace, c)
+ }
+ return true, fmt.Sprintf("pod %q in %q namespace is Ready: %+v", pod.Name, pod.Namespace, c)
+ }
+ }
+ return false, fmt.Sprintf("pod %q in %q namespace does not have %q status: %+v", pod.Name, pod.Namespace, core.PodReady, pod.Status)
+}
diff --git a/pkg/minikube/bootstrapper/kubeadm/kubeadm.go b/pkg/minikube/bootstrapper/kubeadm/kubeadm.go
index ad0348aa4030..34194b954039 100644
--- a/pkg/minikube/bootstrapper/kubeadm/kubeadm.go
+++ b/pkg/minikube/bootstrapper/kubeadm/kubeadm.go
@@ -36,6 +36,7 @@ import (
"github.com/docker/machine/libmachine"
"github.com/docker/machine/libmachine/state"
"github.com/pkg/errors"
+ meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
@@ -470,6 +471,12 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
return nil
}
+ if cfg.VerifyComponents[kverify.ExtraKey] {
+ if err := kverify.WaitExtra(client, kverify.CorePodsList, timeout); err != nil {
+ return errors.Wrap(err, "extra waiting")
+ }
+ }
+
cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
return errors.Wrapf(err, "create runtme-manager %s", cfg.KubernetesConfig.ContainerRuntime)
@@ -504,11 +511,11 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
}
}
}
+
if cfg.VerifyComponents[kverify.KubeletKey] {
if err := kverify.WaitForService(k.c, "kubelet", timeout); err != nil {
return errors.Wrap(err, "waiting for kubelet")
}
-
}

if cfg.VerifyComponents[kverify.NodeReadyKey] {
@@ -658,6 +665,35 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
}
}
+ if cfg.VerifyComponents[kverify.ExtraKey] {
+ // after kubelet is restarted (with 'kubeadm init phase kubelet-start' above),
+ // it appears to be immediately Ready, as do all kube-system pods,
+ // then (after ~10sec) it realises it has some changes to apply, which also implies pod restarts,
+ // and by that time we would have exited completely, so we wait until kubelet begins restarting pods
+ klog.Info("waiting for restarted kubelet to initialise ...")
+ start := time.Now()
+ wait := func() error {
+ pods, err := client.CoreV1().Pods("kube-system").List(meta.ListOptions{})
+ if err != nil {
+ return err
+ }
+ for _, pod := range pods.Items {
+ if pod.Labels["tier"] == "control-plane" {
+ if ready, _ := kverify.IsPodReady(&pod); !ready {
+ return nil
+ }
+ }
+ }
+ return fmt.Errorf("kubelet not initialised")
+ }
+ _ = retry.Expo(wait, 250*time.Millisecond, 1*time.Minute)
+ klog.Infof("kubelet initialised")
+ klog.Infof("duration metric: took %s waiting for restarted kubelet to initialise ...", time.Since(start))
+ if err := kverify.WaitExtra(client, kverify.CorePodsList, kconst.DefaultControlPlaneTimeout); err != nil {
+ return errors.Wrap(err, "extra")
+ }
+ }
+
cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
return errors.Wrap(err, "runtime")
@@ -698,6 +734,7 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
if err := bsutil.AdjustResourceLimits(k.c); err != nil {
klog.Warningf("unable to adjust resource limits: %v", err)
}
+
return nil
}
diff --git a/test/integration/functional_test.go b/test/integration/functional_test.go
index c63c6f94a8a5..d29a8f791635 100644
--- a/test/integration/functional_test.go
+++ b/test/integration/functional_test.go
@@ -256,7 +256,6 @@ func validateDockerEnv(ctx context.Context, t *testing.T, profile string) {
if !strings.Contains(rr.Output(), expectedImgInside) {
t.Fatalf("expected 'docker images' to have %q inside minikube. but the output is: *%s*", expectedImgInside, rr.Output())
}
-
}

func validateStartWithProxy(ctx context.Context, t *testing.T, profile string) {
@@ -269,7 +268,7 @@ func validateStartWithProxy(ctx context.Context, t *testing.T, profile string) {
// Use more memory so that we may reliably fit MySQL and nginx
// changing api server so later in soft start we verify it didn't change
- startArgs := append([]string{"start", "-p", profile, "--memory=4000", fmt.Sprintf("--apiserver-port=%d", apiPortTest), "--wait=true"}, StartArgs()...)
+ startArgs := append([]string{"start", "-p", profile, "--memory=4000", fmt.Sprintf("--apiserver-port=%d", apiPortTest), "--wait=all"}, StartArgs()...)
c := exec.CommandContext(ctx, Target(), startArgs...)
env := os.Environ()
env = append(env, fmt.Sprintf("HTTP_PROXY=%s", srv.Addr))
@@ -401,7 +400,6 @@ func validateMinikubeKubectlDirectCall(ctx context.Context, t *testing.T, profil
if err != nil {
t.Fatalf("failed to run kubectl directly. args %q: %v", rr.Command(), err)
}
-
}

func validateExtraConfig(ctx context.Context, t *testing.T, profile string) {
@@ -409,7 +407,7 @@ func validateExtraConfig(ctx context.Context, t *testing.T, profile string) {
start := time.Now()

// The tests before this already created a profile, starting minikube with different --extra-config cmdline option.
- startArgs := []string{"start", "-p", profile, "--extra-config=apiserver.enable-admission-plugins=NamespaceAutoProvision"}
+ startArgs := []string{"start", "-p", profile, "--extra-config=apiserver.enable-admission-plugins=NamespaceAutoProvision", "--wait=all"}
c := exec.CommandContext(ctx, Target(), startArgs...)
rr, err := Run(t, c)
if err != nil {
@@ -427,7 +425,6 @@ func validateExtraConfig(ctx context.Context, t *testing.T, profile string) {
if !strings.Contains(afterCfg.Config.KubernetesConfig.ExtraOptions.String(), expectedExtraOptions) {
t.Errorf("expected ExtraOptions to contain %s but got %s", expectedExtraOptions, afterCfg.Config.KubernetesConfig.ExtraOptions.String())
}
-
}

// imageID returns a docker image id for image `image` and current architecture
@@ -451,6 +448,7 @@ func imageID(image string) string {
}

// validateComponentHealth asserts that all Kubernetes components are healthy
+// note: it expects all components to be Ready, so it makes sense to run it only right after tests that start minikube with the '--wait=all' flag (i.e. with the extra wait)
func validateComponentHealth(ctx context.Context, t *testing.T, profile string) {
defer PostMortemLogs(t, profile)
@@ -474,12 +472,22 @@ func validateComponentHealth(ctx context.Context, t *testing.T, profile string)
for _, i := range cs.Items {
for _, l := range i.Labels {
- t.Logf("%s phase: %s", l, i.Status.Phase)
- _, ok := found[l]
- if ok {
+ if _, ok := found[l]; ok { // skip irrelevant (e.g. repeating/redundant '"tier": "control-plane"') labels
found[l] = true
- if i.Status.Phase != "Running" {
+ t.Logf("%s phase: %s", l, i.Status.Phase)
+ if i.Status.Phase != api.PodRunning {
t.Errorf("%s is not Running: %+v", l, i.Status)
+ continue
+ }
+ for _, c := range i.Status.Conditions {
+ if c.Type == api.PodReady {
+ if c.Status != api.ConditionTrue {
+ t.Errorf("%s is not Ready: %+v", l, i.Status)
+ } else {
+ t.Logf("%s status: %s", l, c.Type)
+ }
+ break
+ }
}
}
}