diff --git a/bpf/lib/process.h b/bpf/lib/process.h index a4f14948f7d..25c5e0c6db6 100644 --- a/bpf/lib/process.h +++ b/bpf/lib/process.h @@ -564,8 +564,13 @@ FUNC_INLINE struct execve_info *execve_joined_info_map_get(__u64 tid) _Static_assert(sizeof(struct execve_map_value) % 8 == 0, "struct execve_map_value should have size multiple of 8 bytes"); +#define SENT_FAILED_UNKNOWN 0 // unknown error +#define SENT_FAILED_EBUSY 1 // EBUSY +#define SENT_FAILED_ENOSPC 2 // ENOSPC +#define SENT_FAILED_MAX 3 + struct kernel_stats { - __u64 sent_failed[256]; + __u64 sent_failed[256][SENT_FAILED_MAX]; }; struct { @@ -576,7 +581,7 @@ struct { } tg_stats_map SEC(".maps"); FUNC_INLINE void -perf_event_output_metric(void *ctx, u8 metric, void *map, u64 flags, void *data, u64 size) +perf_event_output_metric(void *ctx, u8 msg_op, void *map, u64 flags, void *data, u64 size) { struct kernel_stats *valp; __u32 zero = 0; @@ -585,8 +590,14 @@ perf_event_output_metric(void *ctx, u8 metric, void *map, u64 flags, void *data, err = perf_event_output(ctx, map, flags, data, size); if (err < 0) { valp = map_lookup_elem(&tg_stats_map, &zero); - if (valp) - __sync_fetch_and_add(&valp->sent_failed[metric], 1); + if (valp) { + if (err == -16) // EBUSY + __sync_fetch_and_add(&valp->sent_failed[msg_op][SENT_FAILED_EBUSY], 1); + else if (err == -28) // ENOSPC + __sync_fetch_and_add(&valp->sent_failed[msg_op][SENT_FAILED_ENOSPC], 1); + else + __sync_fetch_and_add(&valp->sent_failed[msg_op][SENT_FAILED_UNKNOWN], 1); + } } } diff --git a/contrib/upgrade-notes/latest.md b/contrib/upgrade-notes/latest.md index ae5f0934ac9..0d773927828 100644 --- a/contrib/upgrade-notes/latest.md +++ b/contrib/upgrade-notes/latest.md @@ -46,3 +46,4 @@ tetragon: * `tetragon_policyfilter_metrics_total` metric is renamed to `tetragon_policyfilter_operations_total`, and its `op` label is renamed to `operation`. +* `tetragon_missed_events_total` metric is renamed to `tetragon_bpf_missed_events_total`. diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md index 5b415012384..969fa1df677 100644 --- a/docs/content/en/docs/reference/metrics.md +++ b/docs/content/en/docs/reference/metrics.md @@ -9,6 +9,15 @@ This page is autogenerated via `make metrics-doc` please do not edit directly. {{< /comment >}} ## Tetragon Health Metrics +### `tetragon_bpf_missed_events_total` + +Number of Tetragon perf events that are failed to be sent from the kernel. + +| label | values | +| ----- | ------ | +| `error` | `EBUSY, ENOSPC, unknown` | +| `msg_op` | `13, 14, 15, 16, 23, 24, 25, 26, 27, 5, 7` | + ### `tetragon_build_info` Build information about tetragon @@ -173,14 +182,6 @@ The number of errors per map. | ----- | ------ | | `map ` | `execve_map, tg_execve_joined_info_map` | -### `tetragon_missed_events_total` - -The total number of Tetragon events per type that are failed to sent from the kernel. - -| label | values | -| ----- | ------ | -| `msg_op` | `13, 14, 15, 16, 23, 24, 25, 26, 27, 5, 7` | - ### `tetragon_missed_link_probes_total` The total number of Tetragon probe missed by link. diff --git a/pkg/api/processapi/processapi.go b/pkg/api/processapi/processapi.go index 8eca43199db..7da1e640e18 100644 --- a/pkg/api/processapi/processapi.go +++ b/pkg/api/processapi/processapi.go @@ -49,6 +49,13 @@ const ( STRING_POSTFIX_MAX_LENGTH = 128 ) +const ( + SentFailedUnknown = iota + SentFailedEbusy + SentFailedEnospc + SentFailedMax +) + type MsgExec struct { Size uint32 PID uint32 @@ -235,7 +242,7 @@ type MsgThrottleEvent struct { } type KernelStats struct { - SentFailed [256]uint64 `align:"sent_failed"` + SentFailed [256][SentFailedMax]uint64 `align:"sent_failed"` } type CgroupRateKey struct { diff --git a/pkg/metrics/eventmetrics/collector.go b/pkg/metrics/eventmetrics/bpfcollector.go similarity index 70% rename from pkg/metrics/eventmetrics/collector.go rename to pkg/metrics/eventmetrics/bpfcollector.go index e9ddc827db3..dffca53a80f 100644 --- a/pkg/metrics/eventmetrics/collector.go +++ b/pkg/metrics/eventmetrics/bpfcollector.go @@ -39,20 +39,26 @@ func collect(ch chan<- prometheus.Metric) { sum := processapi.KernelStats{} for _, val := range allCpuValue { - for i, data := range val.SentFailed { - sum.SentFailed[i] += data + for opcode, errors := range val.SentFailed { + for er, count := range errors { + sum.SentFailed[opcode][er] += count + } } } - for i, data := range sum.SentFailed { - if data > 0 { - ch <- MissedEvents.MustMetric(float64(data), strconv.Itoa(i)) + for opcode, errors := range sum.SentFailed { + for er, count := range errors { + if count > 0 { + ch <- MissedEvents.MustMetric(float64(count), strconv.Itoa(opcode), perfEventErrors[er]) + } } } } func collectForDocs(ch chan<- prometheus.Metric) { for _, opcode := range metrics.OpCodeLabel.Values { - ch <- MissedEvents.MustMetric(0, opcode) + for _, er := range perfEventErrorLabel.Values { + ch <- MissedEvents.MustMetric(0, opcode, er) + } } } diff --git a/pkg/metrics/eventmetrics/eventmetrics.go b/pkg/metrics/eventmetrics/eventmetrics.go index 341a7eeffe7..4bee11f2bda 100644 --- a/pkg/metrics/eventmetrics/eventmetrics.go +++ b/pkg/metrics/eventmetrics/eventmetrics.go @@ -4,6 +4,8 @@ package eventmetrics import ( + "golang.org/x/exp/maps" + v1 "github.com/cilium/cilium/pkg/hubble/api/v1" "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/api/v1/tetragon/codegen/helpers" @@ -20,6 +22,18 @@ import ( "github.com/prometheus/client_golang/prometheus" ) +var ( + perfEventErrors = map[int]string{ + processapi.SentFailedUnknown: "unknown", + processapi.SentFailedEbusy: "EBUSY", + processapi.SentFailedEnospc: "ENOSPC", + } + perfEventErrorLabel = metrics.ConstrainedLabel{ + Name: "error", + Values: maps.Values(perfEventErrors), + } +) + var ( EventsProcessed = metrics.MustNewGranularCounter[metrics.ProcessLabels](prometheus.CounterOpts{ Namespace: consts.MetricsNamespace, @@ -28,9 +42,9 @@ var ( ConstLabels: nil, }, []string{"type"}) MissedEvents = metrics.MustNewCustomCounter(metrics.NewOpts( - consts.MetricsNamespace, "", "missed_events_total", - "The total number of Tetragon events per type that are failed to sent from the kernel.", - nil, []metrics.ConstrainedLabel{metrics.OpCodeLabel}, nil, + consts.MetricsNamespace, "bpf", "missed_events_total", + "Number of Tetragon perf events that are failed to be sent from the kernel.", + nil, []metrics.ConstrainedLabel{metrics.OpCodeLabel, perfEventErrorLabel}, nil, )) FlagCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: consts.MetricsNamespace,