diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index 3418d2af14..5c78a38295 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -32,7 +32,8 @@ import ( const ( // ProgramName is the canonical name of this program - ProgramName = "nfd-master" + ProgramName = "nfd-master" + GrpcHealthPort = 8082 ) func main() { @@ -100,6 +101,7 @@ func main() { utils.ConfigureGrpcKlog() // Get new NfdMaster instance + args.GrpcHealthPort = GrpcHealthPort instance, err := master.NewNfdMaster(args) if err != nil { klog.ErrorS(err, "failed to initialize NfdMaster instance") diff --git a/deployment/base/master/master-deployment.yaml b/deployment/base/master/master-deployment.yaml index bfebfd4dd3..25313d0d63 100644 --- a/deployment/base/master/master-deployment.yaml +++ b/deployment/base/master/master-deployment.yaml @@ -23,12 +23,12 @@ spec: imagePullPolicy: Always livenessProbe: grpc: - port: 8080 + port: 8082 initialDelaySeconds: 10 periodSeconds: 10 readinessProbe: grpc: - port: 8080 + port: 8082 initialDelaySeconds: 5 periodSeconds: 10 failureThreshold: 10 @@ -37,5 +37,3 @@ spec: ports: - name: metrics containerPort: 8081 - - name: grpc - containerPort: 8080 diff --git a/deployment/helm/node-feature-discovery/templates/master.yaml b/deployment/helm/node-feature-discovery/templates/master.yaml index 53a291e0f7..422261be60 100644 --- a/deployment/helm/node-feature-discovery/templates/master.yaml +++ b/deployment/helm/node-feature-discovery/templates/master.yaml @@ -43,12 +43,12 @@ spec: imagePullPolicy: {{ .Values.image.pullPolicy }} livenessProbe: grpc: - port: 8080 + port: 8082 initialDelaySeconds: 10 periodSeconds: 10 readinessProbe: grpc: - port: 8080 + port: 8082 initialDelaySeconds: 5 periodSeconds: 10 failureThreshold: 10 diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index 2f5dfebfe5..fa113f2207 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -116,6 +116,9 @@ type Args struct { 
CrdController bool EnableNodeFeatureApi bool Port int + // GrpcHealthPort is only needed to avoid races between tests (by skipping the health server). + // Could be removed when gRPC labeler service is dropped (when nfd-worker tests stop running nfd-master). + GrpcHealthPort int Prune bool VerifyNodeName bool Options string @@ -144,6 +147,7 @@ type nfdMaster struct { nodeName string configFilePath string server *grpc.Server + healthServer *grpc.Server stop chan struct{} ready chan bool apihelper apihelper.APIHelpers @@ -270,7 +274,11 @@ func (m *nfdMaster) Run() error { // Run gRPC server grpcErr := make(chan error, 1) - go m.runGrpcServer(grpcErr) + // If the NodeFeature API is enabled, don't register the labeler API + // server. Otherwise, register the labeler server. + if !m.args.EnableNodeFeatureApi { + go m.runGrpcServer(grpcErr) + } // Run updater that handles events from the nfd CRD API. if m.nfdController != nil { @@ -281,6 +289,13 @@ func (m *nfdMaster) Run() error { } } + // Start gRPC server for liveness probe (at this point we're "live") + if m.args.GrpcHealthPort != 0 { + if err := m.startGrpcHealthServer(grpcErr); err != nil { + return fmt.Errorf("failed to start gRPC health server: %w", err) + } + } + // Notify that we're ready to accept connections m.ready <- true close(m.ready) @@ -323,6 +338,32 @@ func (m *nfdMaster) Run() error { } } +// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes. +// TODO: improve status checking e.g. with watchdog in the main event loop and +// checking that node updater pool is alive. 
+func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error { + lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort)) + if err != nil { + return fmt.Errorf("failed to listen: %w", err) + } + + s := grpc.NewServer() + grpc_health_v1.RegisterHealthServer(s, health.NewServer()) + klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort) + + go func() { + defer func() { + lis.Close() + }() + if err := s.Serve(lis); err != nil { + errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err) + } + klog.InfoS("gRPC health server stopped") + }() + m.healthServer = s + return nil +} + func (m *nfdMaster) runGrpcServer(errChan chan<- error) { // Create server listening for TCP connections lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.Port)) @@ -352,13 +393,8 @@ func (m *nfdMaster) runGrpcServer(errChan chan<- error) { } m.server = grpc.NewServer(serverOpts...) - // If the NodeFeature API is enabled, don'tregister the labeler API - // server. Otherwise, register the labeler server. - if !m.args.EnableNodeFeatureApi { - pb.RegisterLabelerServer(m.server, m) - } + pb.RegisterLabelerServer(m.server, m) - grpc_health_v1.RegisterHealthServer(m.server, health.NewServer()) klog.InfoS("gRPC server serving", "port", m.args.Port) // Run gRPC server @@ -421,7 +457,12 @@ func (m *nfdMaster) nfdAPIUpdateHandler() { // Stop NfdMaster func (m *nfdMaster) Stop() { - m.server.GracefulStop() + if m.server != nil { + m.server.GracefulStop() + } + if m.healthServer != nil { + m.healthServer.GracefulStop() + } if m.nfdController != nil { m.nfdController.stop()