Skip to content

Commit cda47cc

Browse files
cmotta2016 and camotta authored
feat(observability): add eventID to exposed metrics (#652)
* feat(observability): add eventID to exposed metrics
* chore(observability): using drainEvent parameter instead of rewriting function signature

Co-authored-by: Carlos Motta <[email protected]>
1 parent 814bccc commit cda47cc

File tree

2 files changed: 10 additions (+10) and 8 deletions (-8).

cmd/node-termination-handler.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,7 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int
287287
for {
288288
interruptionEvent := <-cancelChan
289289
nodeName := interruptionEvent.NodeName
290+
eventID := interruptionEvent.EventID
290291
interruptionEventStore.CancelInterruptionEvent(interruptionEvent.EventID)
291292
if interruptionEventStore.ShouldUncordonNode(nodeName) {
292293
log.Info().Msg("Uncordoning the node due to a cancellation event")
@@ -297,7 +298,7 @@ func watchForCancellationEvents(cancelChan <-chan monitor.InterruptionEvent, int
297298
} else {
298299
recorder.Emit(nodeName, observability.Normal, observability.UncordonReason, observability.UncordonMsg)
299300
}
300-
metrics.NodeActionsInc("uncordon", nodeName, err)
301+
metrics.NodeActionsInc("uncordon", nodeName, eventID, err)
301302

302303
err = node.RemoveNTHLabels(nodeName)
303304
if err != nil {
@@ -378,7 +379,7 @@ func runPreDrainTask(node node.Node, nodeName string, drainEvent *monitor.Interr
378379
} else {
379380
recorder.Emit(nodeName, observability.Normal, observability.PreDrainReason, observability.PreDrainMsg)
380381
}
381-
metrics.NodeActionsInc("pre-drain", nodeName, err)
382+
metrics.NodeActionsInc("pre-drain", nodeName, drainEvent.EventID, err)
382383
}
383384

384385
func cordonNode(node node.Node, nodeName string, drainEvent *monitor.InterruptionEvent, metrics observability.Metrics, recorder observability.K8sEventRecorder) error {
@@ -393,7 +394,7 @@ func cordonNode(node node.Node, nodeName string, drainEvent *monitor.Interruptio
393394
return err
394395
} else {
395396
log.Info().Str("node_name", nodeName).Str("reason", drainEvent.Description).Msg("Node successfully cordoned")
396-
metrics.NodeActionsInc("cordon", nodeName, err)
397+
metrics.NodeActionsInc("cordon", nodeName, drainEvent.EventID, err)
397398
recorder.Emit(nodeName, observability.Normal, observability.CordonReason, observability.CordonMsg)
398399
}
399400
return nil
@@ -406,13 +407,13 @@ func cordonAndDrainNode(node node.Node, nodeName string, drainEvent *monitor.Int
406407
log.Err(err).Msgf("node '%s' not found in the cluster", nodeName)
407408
} else {
408409
log.Err(err).Msg("There was a problem while trying to cordon and drain the node")
409-
metrics.NodeActionsInc("cordon-and-drain", nodeName, err)
410+
metrics.NodeActionsInc("cordon-and-drain", nodeName, drainEvent.EventID, err)
410411
recorder.Emit(nodeName, observability.Warning, observability.CordonAndDrainErrReason, observability.CordonAndDrainErrMsgFmt, err.Error())
411412
}
412413
return err
413414
} else {
414415
log.Info().Str("node_name", nodeName).Str("reason", drainEvent.Description).Msg("Node successfully cordoned and drained")
415-
metrics.NodeActionsInc("cordon-and-drain", nodeName, err)
416+
metrics.NodeActionsInc("cordon-and-drain", nodeName, drainEvent.EventID, err)
416417
recorder.Emit(nodeName, observability.Normal, observability.CordonAndDrainReason, observability.CordonAndDrainMsg)
417418
}
418419
return nil
@@ -426,7 +427,7 @@ func runPostDrainTask(node node.Node, nodeName string, drainEvent *monitor.Inter
426427
} else {
427428
recorder.Emit(nodeName, observability.Normal, observability.PostDrainReason, observability.PostDrainMsg)
428429
}
429-
metrics.NodeActionsInc("post-drain", nodeName, err)
430+
metrics.NodeActionsInc("post-drain", nodeName, drainEvent.EventID, err)
430431
}
431432

432433
func getRegionFromQueueURL(queueURL string) string {

pkg/observability/opentelemetry.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ var (
3232
labelNodeActionKey = attribute.Key("node/action")
3333
labelNodeStatusKey = attribute.Key("node/status")
3434
labelNodeNameKey = attribute.Key("node/name")
35+
labelEventIDKey = attribute.Key("node/event-id")
3536
)
3637

3738
// Metrics represents the stats for observability
@@ -88,12 +89,12 @@ func (m Metrics) ErrorEventsInc(where string) {
8889
}
8990

9091
// NodeActionsInc will increment one for the node stats counter, partitioned by action, nodeName and status, and only if metrics are enabled.
91-
func (m Metrics) NodeActionsInc(action, nodeName string, err error) {
92+
func (m Metrics) NodeActionsInc(action, nodeName string, eventID string, err error) {
9293
if !m.enabled {
9394
return
9495
}
9596

96-
labels := []attribute.KeyValue{labelNodeActionKey.String(action), labelNodeNameKey.String(nodeName)}
97+
labels := []attribute.KeyValue{labelNodeActionKey.String(action), labelNodeNameKey.String(nodeName), labelEventIDKey.String(eventID)}
9798
if err != nil {
9899
labels = append(labels, labelNodeStatusKey.String("error"))
99100
} else {

0 commit comments

Comments
 (0)