Skip to content

Commit 4e3f973

Browse files
committed
nfd-worker: Add liveness probe
Signed-off-by: Oleg Zhurakivskyy <[email protected]>
1 parent 35cc819 commit 4e3f973

File tree

4 files changed

+79
-12
lines changed

4 files changed

+79
-12
lines changed

cmd/nfd-worker/main.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ import (
3232

3333
const (
3434
// ProgramName is the canonical name of this program
35-
ProgramName = "nfd-worker"
35+
ProgramName = "nfd-worker"
36+
GrpcHealthPort = 8082
3637
)
3738

3839
func main() {
@@ -81,6 +82,7 @@ func main() {
8182
utils.ConfigureGrpcKlog()
8283

8384
// Get new NfdWorker instance
85+
args.GrpcHealthPort = GrpcHealthPort
8486
instance, err := worker.NewNfdWorker(args)
8587
if err != nil {
8688
klog.ErrorS(err, "failed to initialize NfdWorker instance")

deployment/base/worker-daemonset/worker-daemonset.yaml

+11
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,17 @@ spec:
1919
- name: nfd-worker
2020
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
2121
imagePullPolicy: Always
22+
livenessProbe:
23+
grpc:
24+
port: 8082
25+
initialDelaySeconds: 10
26+
periodSeconds: 10
27+
readinessProbe:
28+
grpc:
29+
port: 8082
30+
initialDelaySeconds: 5
31+
periodSeconds: 10
32+
failureThreshold: 10
2233
command:
2334
- "nfd-worker"
2435
args:

deployment/helm/node-feature-discovery/templates/worker.yaml

+11
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,17 @@ spec:
4343
{{- toYaml .Values.worker.securityContext | nindent 12 }}
4444
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
4545
imagePullPolicy: {{ .Values.image.pullPolicy }}
46+
livenessProbe:
47+
grpc:
48+
port: 8082
49+
initialDelaySeconds: 10
50+
periodSeconds: 10
51+
readinessProbe:
52+
grpc:
53+
port: 8082
54+
initialDelaySeconds: 5
55+
periodSeconds: 10
56+
failureThreshold: 10
4657
env:
4758
- name: NODE_NAME
4859
valueFrom:

pkg/nfd-worker/nfd-worker.go

+54-11
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"crypto/x509"
2222
"encoding/json"
2323
"fmt"
24+
"net"
2425
"os"
2526
"path/filepath"
2627
"regexp"
@@ -33,6 +34,8 @@ import (
3334
"google.golang.org/grpc"
3435
"google.golang.org/grpc/credentials"
3536
"google.golang.org/grpc/credentials/insecure"
37+
"google.golang.org/grpc/health"
38+
"google.golang.org/grpc/health/grpc_health_v1"
3639
"k8s.io/apimachinery/pkg/api/errors"
3740
"k8s.io/apimachinery/pkg/types"
3841
"k8s.io/apimachinery/pkg/util/validation"
@@ -93,17 +96,18 @@ type Labels map[string]string
9396

9497
// Args are the command line arguments of NfdWorker.
9598
type Args struct {
96-
CaFile string
97-
CertFile string
98-
ConfigFile string
99-
KeyFile string
100-
Klog map[string]*utils.KlogFlagVal
101-
Kubeconfig string
102-
Oneshot bool
103-
Options string
104-
Server string
105-
ServerNameOverride string
106-
MetricsPort int
99+
CaFile string
100+
CertFile string
101+
ConfigFile string
102+
KeyFile string
103+
Klog map[string]*utils.KlogFlagVal
104+
Kubeconfig string
105+
Oneshot bool
106+
Options string
107+
Server string
108+
ServerNameOverride string
109+
MetricsPort int
110+
GrpcHealthPort int
107111

108112
Overrides ConfigOverrideArgs
109113
}
@@ -124,6 +128,7 @@ type nfdWorker struct {
124128
config *NFDConfig
125129
kubernetesNamespace string
126130
grpcClient pb.LabelerClient
131+
healthServer *grpc.Server
127132
nfdClient *nfdclient.Clientset
128133
stop chan struct{} // channel for signaling stop
129134
featureSources []source.FeatureSource
@@ -187,6 +192,29 @@ func (i *infiniteTicker) Reset(d time.Duration) {
187192
}
188193
}
189194

195+
// startGrpcHealthServer starts a gRPC server that exposes the gRPC health
// service on the TCP port given by w.args.GrpcHealthPort.
//
// The listener is created synchronously; if that fails, the wrapped error is
// returned and nothing is started. Serving runs in a background goroutine: if
// Serve returns a non-nil error, it is wrapped and sent on errChan. On success
// the server is stored in w.healthServer and nil is returned.
func (w *nfdWorker) startGrpcHealthServer(errChan chan<- error) error {
196+
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort)) // no host part: listen on all local addresses
197+
if err != nil {
198+
return fmt.Errorf("failed to listen: %w", err)
199+
}
200+
201+
s := grpc.NewServer()
202+
grpc_health_v1.RegisterHealthServer(s, health.NewServer()) // health service implementation from the grpc/health package
203+
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)
204+
205+
go func() { // serve in the background so the caller is not blocked
206+
defer func() {
207+
lis.Close() // release the listener when the serving goroutine exits
208+
}()
209+
if err := s.Serve(lis); err != nil {
210+
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err) // blocks if errChan has neither a free buffer slot nor a receiver
211+
}
212+
klog.InfoS("gRPC health server stopped")
213+
}()
214+
w.healthServer = s // keep a handle so the server can be stopped later
215+
return nil
216+
}
217+
190218
// Run feature discovery.
191219
func (w *nfdWorker) runFeatureDiscovery() error {
192220
discoveryStart := time.Now()
@@ -262,8 +290,20 @@ func (w *nfdWorker) Run() error {
262290
return nil
263291
}
264292

293+
grpcErr := make(chan error, 1)
294+
295+
// Start the gRPC health server used by the liveness and readiness probes (at this point we're "live")
296+
if w.args.GrpcHealthPort != 0 {
297+
if err := w.startGrpcHealthServer(grpcErr); err != nil {
298+
return fmt.Errorf("failed to start gRPC health server: %w", err)
299+
}
300+
}
301+
265302
for {
266303
select {
304+
case err := <-grpcErr:
305+
return fmt.Errorf("error in serving gRPC: %w", err)
306+
267307
case <-labelTrigger.C:
268308
err = w.runFeatureDiscovery()
269309
if err != nil {
@@ -294,6 +334,9 @@ func (w *nfdWorker) Run() error {
294334

295335
case <-w.stop:
296336
klog.InfoS("shutting down nfd-worker")
337+
if w.healthServer != nil {
338+
w.healthServer.GracefulStop()
339+
}
297340
configWatch.Close()
298341
w.certWatch.Close()
299342
return nil

0 commit comments

Comments
 (0)