From a6f7678701047409f5d64d329a2fbd2ad14c0d55 Mon Sep 17 00:00:00 2001 From: matewolf Date: Thu, 31 Aug 2023 12:00:45 +0200 Subject: [PATCH] Add liveness and readiness probes to koperator's manager pod (#1050) * feat: add liveness/readiness probes * add extra dash * remove unnecessary quotes * move import healthz import to the third parties * relocate the liveness and readiness probe definitions in manager.yaml file * rename health-probes to liveness-readiness-probe * add missing equal sign * relocate wrongly placed error block * rename healthProbe to livenessReadinessProbe in helm charts * rename liveness-readiness-probe to health-probes because of k8s's name length limitations * rename liveness-readiness-probe to health-probes because of k8s's name length limitations * revert rename of health-probe * remove redundant default value for health probes port definition * remove redundant default value for health probes port definition * remove redundant default value for health probes port definition * remove redundant default value for health probes port definition * add failure threshold because of e2e tests * increase the number of failure threshold --------- Co-authored-by: Marton Barta <51166675+bartam1@users.noreply.github.com> --- .../operator-deployment-with-webhook.yaml | 26 +++++++++++++++-- charts/kafka-operator/values.yaml | 7 +++-- config/base/manager/manager.yaml | 15 ++++++++++ main.go | 29 ++++++++++++++----- 4 files changed, 64 insertions(+), 13 deletions(-) diff --git a/charts/kafka-operator/templates/operator-deployment-with-webhook.yaml b/charts/kafka-operator/templates/operator-deployment-with-webhook.yaml index 3bb05c3bf..aa70af8e0 100644 --- a/charts/kafka-operator/templates/operator-deployment-with-webhook.yaml +++ b/charts/kafka-operator/templates/operator-deployment-with-webhook.yaml @@ -199,9 +199,26 @@ spec: {{- if (.Values.metricEndpoint).port }} - --metrics-addr=":{{ .Values.metricEndpoint.port }}" {{- end }} + {{- if .Values.healthProbes.port }} + - --health-probes-addr=:{{ .Values.healthProbes.port }} + {{- end }} image: "{{ .Values.operator.image.repository }}:{{ .Values.operator.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.operator.image.pullPolicy }} name: manager + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 100 + httpGet: + port: health-probes + path: /healthz + readinessProbe: + initialDelaySeconds: 20 + periodSeconds: 15 + failureThreshold: 100 + httpGet: + port: health-probes + path: /readyz env: - name: POD_NAMESPACE valueFrom: @@ -212,17 +229,20 @@ spec: {{ toYaml .Values.additionalEnv | nindent 12 }} {{- end }} ports: - {{- if .Values.webhook.enabled }} + {{- if .Values.webhook.enabled }} - containerPort: {{ .Values.webhook.serverPort | default 9443 }} name: webhook-server protocol: TCP - {{- end }} + {{- end}} - containerPort: {{ (.Values.metricEndpoint).port | default 8080 }} name: metrics protocol: TCP - containerPort: {{ .Values.alertManager.port }} name: alerts protocol: TCP + - containerPort: {{ .Values.healthProbes.port | default 8081 }} + name: health-probes + protocol: TCP volumeMounts: {{- if .Values.webhook.enabled }} - mountPath: {{ (.Values.webhook.tls).certDir | default "/etc/webhook/certs" }} @@ -230,7 +250,7 @@ spec: readOnly: true {{- end }} resources: - {{ toYaml .Values.operator.resources | nindent 12 }} + {{- toYaml .Values.operator.resources | nindent 12 }} {{- if .Values.containerSecurityContext }} securityContext: {{ toYaml .Values.containerSecurityContext | nindent 12 }} diff --git a/charts/kafka-operator/values.yaml b/charts/kafka-operator/values.yaml index 4368967e6..83698e8db 100644 --- a/charts/kafka-operator/values.yaml +++ b/charts/kafka-operator/values.yaml @@ -20,9 +20,9 @@ operator: # configurable Kubernetes namespaces. # In this scenario, users can replace the default # ClusterRole and ClusterRoleBinding to Role and RoleBinding respectively. - # When this field is not empty and Cert-manager is used, + # When this field is not empty and Cert-manager is used, # the Cert-manager's Custom Resource Namespace must be included in the comma separated list. - # When it is empty, all namespaces will be watched. + # When it is empty, all namespaces will be watched. namespaces: "" verboseLogging: false developmentLogging: false @@ -73,6 +73,9 @@ prometheusMetrics: create: true name: kafka-operator-authproxy +healthProbes: {} + # port: + #metricEndpoint: # port: diff --git a/config/base/manager/manager.yaml b/config/base/manager/manager.yaml index b4ba7893e..b4d30b095 100644 --- a/config/base/manager/manager.yaml +++ b/config/base/manager/manager.yaml @@ -22,10 +22,25 @@ spec: - --enable-leader-election image: ghcr.io/banzaicloud/kafka-operator:latest name: manager + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 10 + httpGet: + port: health-probes + path: /healthz + readinessProbe: + initialDelaySeconds: 20 + periodSeconds: 15 + httpGet: + port: health-probes + path: /readyz ports: - containerPort: 9001 name: alerts protocol: TCP + - containerPort: 8081 + name: health-probes + protocol: TCP resources: limits: cpu: 300m diff --git a/main.go b/main.go index 8f821677f..b3a0a6a1a 100644 --- a/main.go +++ b/main.go @@ -36,6 +36,7 @@ import ( "strings" "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/healthz" istioclientv1beta1 "github.com/banzaicloud/istio-client-go/pkg/networking/v1beta1" @@ -89,6 +90,7 @@ func main() { certSigningDisabled bool certManagerEnabled bool maxKafkaTopicConcurrentReconciles int + healthProbesAddr string ) flag.StringVar(&namespaces, "namespaces", "", "Comma separated list of namespaces where operator listens for resources") @@ -103,6 +105,7 @@ func main() { flag.BoolVar(&certManagerEnabled, "cert-manager-enabled", false, "Enable cert-manager integration") flag.BoolVar(&certSigningDisabled, "disable-cert-signing-support", false, "Disable native certificate signing integration") flag.IntVar(&maxKafkaTopicConcurrentReconciles, "max-kafka-topic-concurrent-reconciles", 10, "Define max amount of concurrent KafkaTopic reconciles") + flag.StringVar(&healthProbesAddr, "health-probes-addr", ":8081", "The address the probe endpoint binds to.") flag.Parse() ctrl.SetLogger(util.CreateLogger(verboseLogging, developmentLogging)) @@ -125,20 +128,30 @@ func main() { } mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ - Scheme: scheme, - MetricsBindAddress: metricsAddr, - LeaderElection: enableLeaderElection, - LeaderElectionID: "controller-leader-election-helper", - NewCache: managerWatchCacheBuilder, - Port: webhookServerPort, - CertDir: webhookCertDir, + Scheme: scheme, + MetricsBindAddress: metricsAddr, + LeaderElection: enableLeaderElection, + LeaderElectionID: "controller-leader-election-helper", + NewCache: managerWatchCacheBuilder, + Port: webhookServerPort, + CertDir: webhookCertDir, + HealthProbeBindAddress: healthProbesAddr, }) - if err != nil { setupLog.Error(err, "unable to start manager") os.Exit(1) } + if err = mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to start /healthz endpoint") + os.Exit(1) + } + + if err = mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to start /readyz endpoint") + os.Exit(1) + } + if err := certv1.AddToScheme(mgr.GetScheme()); err != nil { setupLog.Error(err, "") os.Exit(1)