Skip to content
17 changes: 16 additions & 1 deletion tracer/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,22 @@ func (t *Tracer) startTraceEventMonitor(ctx context.Context,
eventCount++

// Keep track of min KTime seen in this batch processing loop
trace := t.loadBpfTrace(data.RawSample, data.CPU)
trace, err := t.loadBpfTrace(data.RawSample, data.CPU)
switch {
case err == nil:
// Fast path for no error.
case errors.Is(err, errOriginUnexpected):
log.Warnf("skip trace handling: %v", err)
continue
case errors.Is(err, errRecordTooSmall), errors.Is(err, errRecordUnexpectedSize):
log.Errorf("stop receiving traces: %v", err)
// TODO: trigger a graceful shutdown
return
default:
Comment thread
florianl marked this conversation as resolved.
log.Warnf("unexpected error handling trace: %v", err)
continue
}

if minKTime == 0 || trace.KTime < minKTime {
minKTime = trace.KTime
}
Expand Down
19 changes: 13 additions & 6 deletions tracer/tracer.go
Original file line number Diff line number Diff line change
Expand Up @@ -927,23 +927,31 @@ func (t *Tracer) eBPFMetricsCollector(
return metricsUpdates
}

// Sentinel errors returned (wrapped with additional context) by loadBpfTrace.
// Callers should match them with errors.Is rather than comparing messages.
var (
	// errRecordTooSmall indicates that the raw record is shorter than the
	// fixed-size trace header that precedes the frame data.
	errRecordTooSmall = errors.New("trace record too small")
	// errRecordUnexpectedSize indicates that the raw record is shorter than
	// the header plus the frame data length announced in that header.
	errRecordUnexpectedSize = errors.New("unexpected record size")
	// errOriginUnexpected indicates that the trace carries an origin value
	// that loadBpfTrace does not know how to handle.
	errOriginUnexpected = errors.New("unexpected origin")
)

// loadBpfTrace parses a raw BPF trace into a `host.Trace` instance.
//
// If the raw trace contains a kernel stack ID, the kernel stack is also
// retrieved and inserted at the appropriate position.
func (t *Tracer) loadBpfTrace(raw []byte, cpu int) *libpf.EbpfTrace {
func (t *Tracer) loadBpfTrace(raw []byte, cpu int) (*libpf.EbpfTrace, error) {
frameListOffs := int(unsafe.Offsetof(support.Trace{}.Frame_data))

if len(raw) < frameListOffs {
panic("trace record too small")
Copy link
Copy Markdown
Member

@christos68k christos68k Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should keep these panics for now until we have a functional equivalent (cleanly shutting down the receiver).

If we remove them here, we're changing the behavior as we're effectively ignoring these errors that were previously deemed serious enough to panic (we're also hiding this behavior from a future refactoring).

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mixing panic() with error returns in loadBpfTrace() is something I would like to avoid.
The panic() here was replaced with a dedicated error, which allows a graceful shutdown on this error in a future step. This case should not™ happen in regular setups — as far as I know, there has not been a single report of such a panic in the past ~5 years — so for now it is logged at error level.

Copy link
Copy Markdown
Member

@christos68k christos68k Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main issue is that we're hiding information with this change: the current code encodes certain semantics, namely that this error is serious enough to panic. So we at least need to keep the "serious enough" part of these semantics intact; otherwise we're forcing ourselves to remember that these conditions should trigger a shutdown. See my other suggestions.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To mimic the current semantics, I added dedicated error handling for these panic() cases in a10309c: when one of these errors occurs, we now stop receiving and handling traces in user space.

The next step, a complete graceful shutdown, should then be done together with the handling of the log.Fatal() cases in package tracer.

return nil, fmt.Errorf("%d < %d: %w", len(raw), frameListOffs, errRecordTooSmall)
Comment thread
florianl marked this conversation as resolved.
}

ptr := traceFromRaw(raw)
frameDataLen := int(ptr.Frame_data_len) * 8

// NOTE: can't do exact check here: kernel adds a few padding bytes to messages.
if len(raw) < frameListOffs+frameDataLen {
panic("unexpected record size")
return nil, fmt.Errorf("%d < %d: %w", len(raw), frameListOffs+frameDataLen,
Comment thread
florianl marked this conversation as resolved.
errRecordUnexpectedSize)
Comment on lines +953 to +954
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about logging the error here and returning err directly, so the caller can determine what to do? The same applies to other similar places.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used fmt.Errorf() here because I think it is important to attach this additional information to the error so that it can be acted on later. As control/program flow can change, I didn't want to put a log here; I would rather keep the log that indicates the termination close to the place where the termination actually happens.

}

pid := libpf.PID(ptr.Pid)
Expand All @@ -970,8 +978,7 @@ func (t *Tracer) loadBpfTrace(raw []byte, cpu int) *libpf.EbpfTrace {
case support.TraceOriginOffCPU:
case support.TraceOriginProbe:
default:
log.Warnf("Skip handling trace from unexpected %d origin", trace.Origin)
return nil
return nil, fmt.Errorf("origin %d: %w", trace.Origin, errOriginUnexpected)
}

if ptr.Kernel_stack_id >= 0 {
Expand All @@ -996,7 +1003,7 @@ func (t *Tracer) loadBpfTrace(raw []byte, cpu int) *libpf.EbpfTrace {
trace.FrameData = trace.FrameDataBuf[:ptr.Frame_data_len]
copy(trace.FrameData, ptr.Frame_data[:ptr.Frame_data_len])

return trace
return trace, nil
}

// StartMapMonitors starts goroutines for collecting metrics and monitoring eBPF
Expand Down