Skip to content

Commit

Permalink
nfd-topology-updater: Add liveness probe
Browse files Browse the repository at this point in the history
Signed-off-by: Oleg Zhurakivskyy <[email protected]>
  • Loading branch information
ozhuraki committed Mar 28, 2024
1 parent 137f18b commit ae7609a
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 0 deletions.
2 changes: 2 additions & 0 deletions cmd/nfd-topology-updater/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ const (
// ProgramName is the canonical name of this program
ProgramName = "nfd-topology-updater"
kubeletSecurePort = 10250
GrpcHealthPort = 8082
)

var DefaultKubeletStateDir = path.Join(string(hostpath.VarDir), "lib", "kubelet")
Expand All @@ -54,6 +55,7 @@ func main() {
utils.ConfigureGrpcKlog()

// Get new TopologyUpdater instance
args.GrpcHealthPort = GrpcHealthPort

Check warning on line 58 in cmd/nfd-topology-updater/main.go

View check run for this annotation

Codecov / codecov/patch

cmd/nfd-topology-updater/main.go#L58

Added line #L58 was not covered by tests
instance, err := topology.NewTopologyUpdater(*args, *resourcemonitorArgs)
if err != nil {
klog.ErrorS(err, "failed to initialize topology updater instance")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@ spec:
- name: nfd-topology-updater
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
imagePullPolicy: Always
livenessProbe:
grpc:
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
command:
- "nfd-topology-updater"
args: []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,17 @@ spec:
- name: topology-updater
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
livenessProbe:
grpc:
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
env:
- name: NODE_NAME
valueFrom:
Expand Down
14 changes: 14 additions & 0 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,20 @@ topologyUpdater:
readOnlyRootFilesystem: true
runAsUser: 0

# livenessProbe: {}
## NOTE: Currently not configurable; the default values below are shown for documentation purposes only.
# grpc:
# port: 8082
# initialDelaySeconds: 10
# periodSeconds: 10
# readinessProbe: {}
## NOTE: Currently not configurable; the default values below are shown for documentation purposes only.
# grpc:
# port: 8082
# initialDelaySeconds: 5
# periodSeconds: 10
# failureThreshold: 10

resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
Expand Down
44 changes: 44 additions & 0 deletions pkg/nfd-topology-updater/nfd-topology-updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,16 @@ package nfdtopologyupdater

import (
"fmt"
"net"
"net/url"
"os"
"path/filepath"

"golang.org/x/net/context"

"google.golang.org/grpc"
"google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -58,6 +62,7 @@ type Args struct {
KubeConfigFile string
ConfigFile string
KubeletStateDir string
GrpcHealthPort int

Klog map[string]*utils.KlogFlagVal
}
Expand Down Expand Up @@ -85,6 +90,7 @@ type nfdTopologyUpdater struct {
ownerRefs []metav1.OwnerReference
k8sClient k8sclient.Interface
kubeletConfigFunc func() (*kubeletconfigv1beta1.KubeletConfiguration, error)
healthServer *grpc.Server
}

// NewTopologyUpdater creates a new NfdTopologyUpdater instance.
Expand Down Expand Up @@ -128,6 +134,29 @@ func (w *nfdTopologyUpdater) detectTopologyPolicyAndScope() (string, string, err
return klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope, nil
}

// startGrpcHealthServer starts a gRPC server exposing the standard gRPC health
// service on the TCP port given by w.args.GrpcHealthPort.
//
// The listener is created synchronously, so a failure to bind is returned to
// the caller directly. Serving happens in a background goroutine; if Serve
// returns a non-nil error it is wrapped and sent on errChan. On success the
// server is stored in w.healthServer and nil is returned.
func (w *nfdTopologyUpdater) startGrpcHealthServer(errChan chan<- error) error {
// No host is given in the address, only the port.
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort))
if err != nil {
return fmt.Errorf("failed to listen: %w", err)
}

Check warning on line 141 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L137-L141

Added lines #L137 - L141 were not covered by tests

// Register a default health service implementation on a fresh gRPC server;
// no other services are registered on it here.
s := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)

// Serve in the background so the caller is not blocked.
go func() {
// NOTE(review): grpc's Serve is documented to close lis when it returns,
// so this deferred Close looks redundant; its error is ignored — confirm.
defer func() {
lis.Close()
}()
// Only a non-nil Serve error is forwarded; this send blocks until
// errChan has buffer space or a receiver is ready.
if err := s.Serve(lis); err != nil {
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
}
klog.InfoS("gRPC health server stopped")

Check warning on line 154 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L143-L154

Added lines #L143 - L154 were not covered by tests
}()
// Keep a handle on the server so it can be stopped later via w.healthServer.
w.healthServer = s
return nil

Check warning on line 157 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L156-L157

Added lines #L156 - L157 were not covered by tests
}

// Run nfdTopologyUpdater. Returns if a fatal error is encountered, or, after
// one request if OneShot is set to 'true' in the updater args.
func (w *nfdTopologyUpdater) Run() error {
Expand Down Expand Up @@ -187,8 +216,20 @@ func (w *nfdTopologyUpdater) Run() error {
return fmt.Errorf("failed to obtain node resource information: %w", err)
}

grpcErr := make(chan error, 1)

// Start gRPC server for liveness probe (at this point we're "live")
if w.args.GrpcHealthPort != 0 {
if err := w.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}

Check warning on line 225 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L219-L225

Added lines #L219 - L225 were not covered by tests
}

for {
select {
case err := <-grpcErr:
return fmt.Errorf("error in serving gRPC: %w", err)

Check warning on line 231 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L230-L231

Added lines #L230 - L231 were not covered by tests

case info := <-w.eventSource:
klog.V(4).InfoS("event received, scanning...", "event", info.Event)
scanResponse, err := resScan.Scan()
Expand Down Expand Up @@ -217,6 +258,9 @@ func (w *nfdTopologyUpdater) Run() error {

case <-w.stop:
klog.InfoS("shutting down nfd-topology-updater")
if w.healthServer != nil {
w.healthServer.GracefulStop()
}

Check warning on line 263 in pkg/nfd-topology-updater/nfd-topology-updater.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-topology-updater/nfd-topology-updater.go#L261-L263

Added lines #L261 - L263 were not covered by tests
return nil
}
}
Expand Down

0 comments on commit ae7609a

Please sign in to comment.