Skip to content

Commit

Permalink
nfd-master: run a separate gRPC health server
Browse files Browse the repository at this point in the history
This patch separates the gRPC health server from the deprecated gRPC
server (disabled by default, replaced by the NodeFeature CRD API) used
for node labeling requests. The new health server runs on hardcoded TCP
port number 8082.

The main motivation for this change is to make Kubernetes' built-in
gRPC liveness probes work when TLS is enabled (the probes themselves
don't support TLS).

The health server itself is a naive implementation (as it was before),
basically only checking that nfd-master has started and hasn't crashed.
The patch adds a TODO note to improve the functionality.
  • Loading branch information
marquiz committed Jan 4, 2024
1 parent b3919f3 commit f0ec516
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 13 deletions.
6 changes: 2 additions & 4 deletions deployment/base/master/master-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ spec:
imagePullPolicy: Always
livenessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
Expand All @@ -37,5 +37,3 @@ spec:
ports:
- name: metrics
containerPort: 8081
- name: grpc
containerPort: 8080
4 changes: 2 additions & 2 deletions deployment/helm/node-feature-discovery/templates/master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ spec:
imagePullPolicy: {{ .Values.image.pullPolicy }}
livenessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
Expand Down
48 changes: 41 additions & 7 deletions pkg/nfd-master/nfd-master.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ import (
"sigs.k8s.io/node-feature-discovery/pkg/version"
)

// GRPC_HEALTH_SERVER_PORT is the hardcoded TCP port on which the gRPC health
// server listens.
const GRPC_HEALTH_SERVER_PORT = 8082

// Labels are a Kubernetes representation of discovered features, stored as
// a plain string-to-string map.
type Labels map[string]string

Expand Down Expand Up @@ -144,6 +148,7 @@ type nfdMaster struct {
nodeName string
configFilePath string
server *grpc.Server
healthServer *grpc.Server
stop chan struct{}
ready chan bool
apihelper apihelper.APIHelpers
Expand Down Expand Up @@ -270,7 +275,11 @@ func (m *nfdMaster) Run() error {

// Run gRPC server
grpcErr := make(chan error, 1)
go m.runGrpcServer(grpcErr)
// If the NodeFeature API is enabled, don't start the gRPC server that
// serves the labeler API. Otherwise, run it in the background.
if !m.args.EnableNodeFeatureApi {
go m.runGrpcServer(grpcErr)
}

// Run updater that handles events from the nfd CRD API.
if m.nfdController != nil {
Expand All @@ -281,6 +290,11 @@ func (m *nfdMaster) Run() error {
}
}

// Start gRPC server for liveness probe (at this point we're "live")
if err := m.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}

// Notify that we're ready to accept connections
m.ready <- true
close(m.ready)
Expand Down Expand Up @@ -323,6 +337,30 @@ func (m *nfdMaster) Run() error {
}
}

// startGrpcHealthServer brings up a dedicated gRPC health endpoint, created
// without any server options, for Kubernetes readiness/liveness probes.
//
// The TCP listener is bound synchronously so that a bind failure is returned
// to the caller. Serving then happens in a background goroutine; a non-nil
// error from Serve is forwarded on errChan. The created server is stored in
// m.healthServer.
//
// TODO: improve status checking e.g. with watchdog in the main event loop and
// checking that node updater pool is alive.
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
	addr := fmt.Sprintf(":%d", GRPC_HEALTH_SERVER_PORT)
	listener, listenErr := net.Listen("tcp", addr)
	if listenErr != nil {
		return fmt.Errorf("failed to listen: %w", listenErr)
	}

	healthSrv := grpc.NewServer()
	grpc_health_v1.RegisterHealthServer(healthSrv, health.NewServer())
	klog.InfoS("gRPC health server serving", "port", GRPC_HEALTH_SERVER_PORT)

	// serve blocks until the health server terminates and reports an
	// abnormal exit to the caller-supplied error channel.
	serve := func() {
		defer listener.Close()
		serveErr := healthSrv.Serve(listener)
		if serveErr != nil {
			errChan <- fmt.Errorf("gRPC health server exited with an error: %w", serveErr)
		}
		klog.InfoS("gRPC health server stopped")
	}
	go serve()

	m.healthServer = healthSrv
	return nil
}

func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
// Create server listening for TCP connections
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.Port))
Expand Down Expand Up @@ -352,13 +390,8 @@ func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
}
m.server = grpc.NewServer(serverOpts...)

// If the NodeFeature API is enabled, don't register the labeler API
// server. Otherwise, register the labeler server.
if !m.args.EnableNodeFeatureApi {
pb.RegisterLabelerServer(m.server, m)
}
pb.RegisterLabelerServer(m.server, m)

grpc_health_v1.RegisterHealthServer(m.server, health.NewServer())
klog.InfoS("gRPC server serving", "port", m.args.Port)

// Run gRPC server
Expand Down Expand Up @@ -422,6 +455,7 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
// Stop NfdMaster
func (m *nfdMaster) Stop() {
m.server.GracefulStop()
m.healthServer.GracefulStop()

if m.nfdController != nil {
m.nfdController.stop()
Expand Down

0 comments on commit f0ec516

Please sign in to comment.