Skip to content

Commit fcf819a

Browse files
authored
Merge pull request #1643 from ozhuraki/topology-health
nfd-topology-updater: Add liveness probe
2 parents 7938e81 + f2e9557 commit fcf819a

File tree

5 files changed

+82
-0
lines changed

5 files changed

+82
-0
lines changed

cmd/nfd-topology-updater/main.go

+2
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ const (
3636
// ProgramName is the canonical name of this program
3737
ProgramName = "nfd-topology-updater"
3838
kubeletSecurePort = 10250
39+
GrpcHealthPort = 8082
3940
)
4041

4142
var DefaultKubeletStateDir = path.Join(string(hostpath.VarDir), "lib", "kubelet")
@@ -54,6 +55,7 @@ func main() {
5455
utils.ConfigureGrpcKlog()
5556

5657
// Get new TopologyUpdater instance
58+
args.GrpcHealthPort = GrpcHealthPort
5759
instance, err := topology.NewTopologyUpdater(*args, *resourcemonitorArgs)
5860
if err != nil {
5961
klog.ErrorS(err, "failed to initialize topology updater instance")

deployment/base/topologyupdater-daemonset/topologyupdater-daemonset.yaml

+11
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,17 @@ spec:
1919
- name: nfd-topology-updater
2020
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
2121
imagePullPolicy: Always
22+
livenessProbe:
23+
grpc:
24+
port: 8082
25+
initialDelaySeconds: 10
26+
periodSeconds: 10
27+
readinessProbe:
28+
grpc:
29+
port: 8082
30+
initialDelaySeconds: 5
31+
periodSeconds: 10
32+
failureThreshold: 10
2233
command:
2334
- "nfd-topology-updater"
2435
args: []

deployment/helm/node-feature-discovery/templates/topologyupdater.yaml

+11
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,17 @@ spec:
4141
- name: topology-updater
4242
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
4343
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
44+
livenessProbe:
45+
grpc:
46+
port: 8082
47+
initialDelaySeconds: 10
48+
periodSeconds: 10
49+
readinessProbe:
50+
grpc:
51+
port: 8082
52+
initialDelaySeconds: 5
53+
periodSeconds: 10
54+
failureThreshold: 10
4455
env:
4556
- name: NODE_NAME
4657
valueFrom:

deployment/helm/node-feature-discovery/values.yaml

+14
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,20 @@ topologyUpdater:
475475
readOnlyRootFilesystem: true
476476
runAsUser: 0
477477

478+
# livenessProbe: {}
479+
## NOTE: Currently not configurable; the defaults are listed here for documentation purposes only.
480+
# grpc:
481+
# port: 8082
482+
# initialDelaySeconds: 10
483+
# periodSeconds: 10
484+
# readinessProbe: {}
485+
## NOTE: Currently not configurable; the defaults are listed here for documentation purposes only.
486+
# grpc:
487+
# port: 8082
488+
# initialDelaySeconds: 5
489+
# periodSeconds: 10
490+
# failureThreshold: 10
491+
478492
resources:
479493
limits:
480494
cpu: 100m

pkg/nfd-topology-updater/nfd-topology-updater.go

+44
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,16 @@ package nfdtopologyupdater
1818

1919
import (
2020
"fmt"
21+
"net"
2122
"net/url"
2223
"os"
2324
"path/filepath"
2425

2526
"golang.org/x/net/context"
2627

28+
"google.golang.org/grpc"
29+
"google.golang.org/grpc/health"
30+
"google.golang.org/grpc/health/grpc_health_v1"
2731
"k8s.io/apimachinery/pkg/api/errors"
2832
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2933
"k8s.io/apimachinery/pkg/types"
@@ -58,6 +62,7 @@ type Args struct {
5862
KubeConfigFile string
5963
ConfigFile string
6064
KubeletStateDir string
65+
GrpcHealthPort int
6166

6267
Klog map[string]*utils.KlogFlagVal
6368
}
@@ -85,6 +90,7 @@ type nfdTopologyUpdater struct {
8590
ownerRefs []metav1.OwnerReference
8691
k8sClient k8sclient.Interface
8792
kubeletConfigFunc func() (*kubeletconfigv1beta1.KubeletConfiguration, error)
93+
healthServer *grpc.Server
8894
}
8995

9096
// NewTopologyUpdater creates a new NfdTopologyUpdater instance.
@@ -128,6 +134,29 @@ func (w *nfdTopologyUpdater) detectTopologyPolicyAndScope() (string, string, err
128134
return klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope, nil
129135
}
130136

137+
func (w *nfdTopologyUpdater) startGrpcHealthServer(errChan chan<- error) error {
138+
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort))
139+
if err != nil {
140+
return fmt.Errorf("failed to listen: %w", err)
141+
}
142+
143+
s := grpc.NewServer()
144+
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
145+
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)
146+
147+
go func() {
148+
defer func() {
149+
lis.Close()
150+
}()
151+
if err := s.Serve(lis); err != nil {
152+
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
153+
}
154+
klog.InfoS("gRPC health server stopped")
155+
}()
156+
w.healthServer = s
157+
return nil
158+
}
159+
131160
// Run nfdTopologyUpdater. Returns if a fatal error is encountered, or, after
132161
// one request if OneShot is set to 'true' in the updater args.
133162
func (w *nfdTopologyUpdater) Run() error {
@@ -187,8 +216,20 @@ func (w *nfdTopologyUpdater) Run() error {
187216
return fmt.Errorf("failed to obtain node resource information: %w", err)
188217
}
189218

219+
grpcErr := make(chan error, 1)
220+
221+
// Start gRPC server for liveness probe (at this point we're "live")
222+
if w.args.GrpcHealthPort != 0 {
223+
if err := w.startGrpcHealthServer(grpcErr); err != nil {
224+
return fmt.Errorf("failed to start gRPC health server: %w", err)
225+
}
226+
}
227+
190228
for {
191229
select {
230+
case err := <-grpcErr:
231+
return fmt.Errorf("error in serving gRPC: %w", err)
232+
192233
case info := <-w.eventSource:
193234
klog.V(4).InfoS("event received, scanning...", "event", info.Event)
194235
scanResponse, err := resScan.Scan()
@@ -217,6 +258,9 @@ func (w *nfdTopologyUpdater) Run() error {
217258

218259
case <-w.stop:
219260
klog.InfoS("shutting down nfd-topology-updater")
261+
if w.healthServer != nil {
262+
w.healthServer.GracefulStop()
263+
}
220264
return nil
221265
}
222266
}

0 commit comments

Comments
 (0)