Skip to content

Commit

Permalink
Merge pull request #10424 from prezha/fix-WaitForPod-Ready
Browse files Browse the repository at this point in the history
add new extra component to --wait=all to validate a healthy cluster
  • Loading branch information
medyagh committed Feb 16, 2021
2 parents 3bdb549 + 8099f75 commit d84f178
Show file tree
Hide file tree
Showing 4 changed files with 201 additions and 12 deletions.
15 changes: 13 additions & 2 deletions pkg/minikube/bootstrapper/bsutil/kverify/kverify.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,18 @@ const (
NodeReadyKey = "node_ready"
// KubeletKey is the name used in the flags for waiting for the kubelet status to be ready
KubeletKey = "kubelet"
// ExtraKey is the name used for extra waiting for pods in CorePodsList to be Ready
ExtraKey = "extra"
)

// vars related to the --wait flag
var (
// DefaultComponents is map of the the default components to wait for
DefaultComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true}
// NoWaitComponents is map of componets to wait for if specified 'none' or 'false'
NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunningKey: false, NodeReadyKey: false, KubeletKey: false}
NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunningKey: false, NodeReadyKey: false, KubeletKey: false, ExtraKey: false}
// AllComponents is map for waiting for all components.
AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunningKey: true, NodeReadyKey: true, KubeletKey: true}
AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunningKey: true, NodeReadyKey: true, KubeletKey: true, ExtraKey: true}
// DefaultWaitList is list of all default components to wait for. only names to be used for start flags.
DefaultWaitList = []string{APIServerWaitKey, SystemPodsWaitKey}
// AllComponentsList list of all valid components keys to wait for. only names to be used used for start flags.
Expand All @@ -60,6 +62,15 @@ var (
"kube-proxy",
"kube-scheduler",
}
// CorePodsList is a list of essential pods for running kurnetes to extra wait for them to be Ready
CorePodsList = []string{
"kube-dns", // coredns
"etcd",
"kube-apiserver",
"kube-controller-manager",
"kube-proxy",
"kube-scheduler",
}
)

// ShouldWait will return true if the config says need to wait
Expand Down
133 changes: 133 additions & 0 deletions pkg/minikube/bootstrapper/bsutil/kverify/pod_ready.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
/*
Copyright 2021 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package kverify verifies a running Kubernetes cluster is healthy
package kverify

import (
"fmt"
"strings"
"time"

"github.com/pkg/errors"
core "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants"
)

// WaitExtra calls WaitForPodReadyByLabel for each pod in labels list and returns any errors occurred.
func WaitExtra(cs *kubernetes.Clientset, labels []string, timeout time.Duration) error {
klog.Infof("extra waiting for kube-system core pods %s to be Ready ...", labels)
start := time.Now()
defer func() {
klog.Infof("duration metric: took %s for extra waiting for kube-system core pods to be Ready ...", time.Since(start))
}()

var errs []string
for _, label := range labels {
if err := waitForPodReadyByLabel(cs, label, "kube-system", timeout); err != nil {
errs = append(errs, fmt.Sprintf("%q: %q", label, err.Error()))
}
}
if errs != nil {
return fmt.Errorf(strings.Join(errs, ", "))
}

return nil
}

// waitForPodReadyByLabel waits for pod with label ([key:]val) in a namespace to be in Ready condition.
// If namespace is not provided, it defaults to "kube-system".
// If label key is not provided, it will try with "component" and "k8s-app".
func waitForPodReadyByLabel(cs *kubernetes.Clientset, label, namespace string, timeout time.Duration) error {
klog.Infof("waiting %v for pod with %q label in %q namespace to be Ready ...", timeout, label, namespace)
start := time.Now()
defer func() {
klog.Infof("duration metric: took %v to run WaitForPodReadyByLabel for pod with %q label in %q namespace ...", time.Since(start), label, namespace)
}()

if namespace == "" {
namespace = "kube-system"
}

lkey := ""
lval := ""
l := strings.Split(label, ":")
switch len(l) {
case 1: // treat as no label key provided, just val
lval = strings.TrimSpace(l[0])
case 2:
lkey = strings.TrimSpace(l[0])
lval = strings.TrimSpace(l[1])
default:
return fmt.Errorf("pod label %q is malformed", label)
}

lap := time.Now()
checkReady := func() (bool, error) {
if time.Since(start) > timeout {
return false, fmt.Errorf("wait for pod with %q label in %q namespace to be Ready timed out", label, namespace)
}
pods, err := cs.CoreV1().Pods(namespace).List(meta.ListOptions{})
if err != nil {
klog.Infof("error listing pods in %q namespace, will retry: %v", namespace, err)
return false, nil
}
for _, pod := range pods.Items {
for k, v := range pod.ObjectMeta.Labels {
if ((lkey == "" && (k == "component" || k == "k8s-app")) || lkey == k) && v == lval {
ready, reason := IsPodReady(&pod)
if ready {
klog.Info(reason)
return true, nil
}
// reduce log spam
if time.Since(lap) > (1 * time.Second) {
klog.Info(reason)
lap = time.Now()
}
return false, nil
}
}
}
klog.Infof("pod with %q label in %q namespace was not found, will retry", label, namespace)
return false, nil
}
if err := wait.PollImmediate(kconst.APICallRetryInterval, kconst.DefaultControlPlaneTimeout, checkReady); err != nil {
return errors.Wrapf(err, "wait pod Ready")
}

return nil
}

// IsPodReady returns if pod is Ready and verbose reason.
func IsPodReady(pod *core.Pod) (ready bool, reason string) {
if pod.Status.Phase != core.PodRunning {
return false, fmt.Sprintf("pod %q in %q namespace is not Running: %+v", pod.Name, pod.Namespace, pod.Status)
}
for _, c := range pod.Status.Conditions {
if c.Type == core.PodReady {
if c.Status != core.ConditionTrue {
return false, fmt.Sprintf("pod %q in %q namespace is not Ready: %+v", pod.Name, pod.Namespace, c)
}
return true, fmt.Sprintf("pod %q in %q namespace is Ready: %+v", pod.Name, pod.Namespace, c)
}
}
return false, fmt.Sprintf("pod %q in %q namespace does not have %q status: %+v", pod.Name, pod.Namespace, core.PodReady, pod.Status)
}
39 changes: 38 additions & 1 deletion pkg/minikube/bootstrapper/kubeadm/kubeadm.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
"github.com/docker/machine/libmachine"
"github.com/docker/machine/libmachine/state"
"github.com/pkg/errors"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
Expand Down Expand Up @@ -470,6 +471,12 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
return nil
}

if cfg.VerifyComponents[kverify.ExtraKey] {
if err := kverify.WaitExtra(client, kverify.CorePodsList, timeout); err != nil {
return errors.Wrap(err, "extra waiting")
}
}

cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
return errors.Wrapf(err, "create runtme-manager %s", cfg.KubernetesConfig.ContainerRuntime)
Expand Down Expand Up @@ -504,11 +511,11 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
}
}
}

if cfg.VerifyComponents[kverify.KubeletKey] {
if err := kverify.WaitForService(k.c, "kubelet", timeout); err != nil {
return errors.Wrap(err, "waiting for kubelet")
}

}

if cfg.VerifyComponents[kverify.NodeReadyKey] {
Expand Down Expand Up @@ -658,6 +665,35 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
}
}

if cfg.VerifyComponents[kverify.ExtraKey] {
// after kubelet is restarted (with 'kubeadm init phase kubelet-start' above),
// it appears as to be immediately Ready as well as all kube-system pods,
// then (after ~10sec) it realises it has some changes to apply, implying also pods restarts,
// and by that time we would exit completely, so we wait until kubelet begins restarting pods
klog.Info("waiting for restarted kubelet to initialise ...")
start := time.Now()
wait := func() error {
pods, err := client.CoreV1().Pods("kube-system").List(meta.ListOptions{})
if err != nil {
return err
}
for _, pod := range pods.Items {
if pod.Labels["tier"] == "control-plane" {
if ready, _ := kverify.IsPodReady(&pod); !ready {
return nil
}
}
}
return fmt.Errorf("kubelet not initialised")
}
_ = retry.Expo(wait, 250*time.Millisecond, 1*time.Minute)
klog.Infof("kubelet initialised")
klog.Infof("duration metric: took %s waiting for restarted kubelet to initialise ...", time.Since(start))
if err := kverify.WaitExtra(client, kverify.CorePodsList, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "extra")
}
}

cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
return errors.Wrap(err, "runtime")
Expand Down Expand Up @@ -698,6 +734,7 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
if err := bsutil.AdjustResourceLimits(k.c); err != nil {
klog.Warningf("unable to adjust resource limits: %v", err)
}

return nil
}

Expand Down
26 changes: 17 additions & 9 deletions test/integration/functional_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,6 @@ func validateDockerEnv(ctx context.Context, t *testing.T, profile string) {
if !strings.Contains(rr.Output(), expectedImgInside) {
t.Fatalf("expected 'docker images' to have %q inside minikube. but the output is: *%s*", expectedImgInside, rr.Output())
}

}

func validateStartWithProxy(ctx context.Context, t *testing.T, profile string) {
Expand All @@ -269,7 +268,7 @@ func validateStartWithProxy(ctx context.Context, t *testing.T, profile string) {

// Use more memory so that we may reliably fit MySQL and nginx
// changing api server so later in soft start we verify it didn't change
startArgs := append([]string{"start", "-p", profile, "--memory=4000", fmt.Sprintf("--apiserver-port=%d", apiPortTest), "--wait=true"}, StartArgs()...)
startArgs := append([]string{"start", "-p", profile, "--memory=4000", fmt.Sprintf("--apiserver-port=%d", apiPortTest), "--wait=all"}, StartArgs()...)
c := exec.CommandContext(ctx, Target(), startArgs...)
env := os.Environ()
env = append(env, fmt.Sprintf("HTTP_PROXY=%s", srv.Addr))
Expand Down Expand Up @@ -401,15 +400,14 @@ func validateMinikubeKubectlDirectCall(ctx context.Context, t *testing.T, profil
if err != nil {
t.Fatalf("failed to run kubectl directly. args %q: %v", rr.Command(), err)
}

}

func validateExtraConfig(ctx context.Context, t *testing.T, profile string) {
defer PostMortemLogs(t, profile)

start := time.Now()
// The tests before this already created a profile, starting minikube with different --extra-config cmdline option.
startArgs := []string{"start", "-p", profile, "--extra-config=apiserver.enable-admission-plugins=NamespaceAutoProvision"}
startArgs := []string{"start", "-p", profile, "--extra-config=apiserver.enable-admission-plugins=NamespaceAutoProvision", "--wait=all"}
c := exec.CommandContext(ctx, Target(), startArgs...)
rr, err := Run(t, c)
if err != nil {
Expand All @@ -427,7 +425,6 @@ func validateExtraConfig(ctx context.Context, t *testing.T, profile string) {
if !strings.Contains(afterCfg.Config.KubernetesConfig.ExtraOptions.String(), expectedExtraOptions) {
t.Errorf("expected ExtraOptions to contain %s but got %s", expectedExtraOptions, afterCfg.Config.KubernetesConfig.ExtraOptions.String())
}

}

// imageID returns a docker image id for image `image` and current architecture
Expand All @@ -451,6 +448,7 @@ func imageID(image string) string {
}

// validateComponentHealth asserts that all Kubernetes components are healthy
// note: it expects all components to be Ready, so it makes sense to run it close after only those tests that include '--wait=all' start flag (ie, with extra wait)
func validateComponentHealth(ctx context.Context, t *testing.T, profile string) {
defer PostMortemLogs(t, profile)

Expand All @@ -474,12 +472,22 @@ func validateComponentHealth(ctx context.Context, t *testing.T, profile string)

for _, i := range cs.Items {
for _, l := range i.Labels {
t.Logf("%s phase: %s", l, i.Status.Phase)
_, ok := found[l]
if ok {
if _, ok := found[l]; ok { // skip irrelevant (eg, repeating/redundant '"tier": "control-plane"') labels
found[l] = true
if i.Status.Phase != "Running" {
t.Logf("%s phase: %s", l, i.Status.Phase)
if i.Status.Phase != api.PodRunning {
t.Errorf("%s is not Running: %+v", l, i.Status)
continue
}
for _, c := range i.Status.Conditions {
if c.Type == api.PodReady {
if c.Status != api.ConditionTrue {
t.Errorf("%s is not Ready: %+v", l, i.Status)
} else {
t.Logf("%s status: %s", l, c.Type)
}
break
}
}
}
}
Expand Down

0 comments on commit d84f178

Please sign in to comment.