diff --git a/.github/workflows/unit-test-on-pull-request.yml b/.github/workflows/unit-test-on-pull-request.yml index 2f0a6b7d8..a9999a639 100644 --- a/.github/workflows/unit-test-on-pull-request.yml +++ b/.github/workflows/unit-test-on-pull-request.yml @@ -138,6 +138,7 @@ jobs: needs: build-integration-test-binaries timeout-minutes: 10 strategy: + fail-fast: false matrix: include: # List of available kernels here: diff --git a/doc/KNOWN_KERNEL_LIMITATIONS.md b/doc/KNOWN_KERNEL_LIMITATIONS.md index 1c35d5a6a..030f0fd81 100644 --- a/doc/KNOWN_KERNEL_LIMITATIONS.md +++ b/doc/KNOWN_KERNEL_LIMITATIONS.md @@ -10,14 +10,6 @@ There was a limit of 1 eBPF program per tracepoint/kprobe. This limit no longer holds and was removed with commit [e87c6bc3852b981e71c757be20771546ce9f76f3](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e87c6bc3852b981e71c757be20771546ce9f76f3). -Obtaining Kernel backtrace --------------------------- -Affects kernel < 4.18 - -It is not possible to get individual backtraces from kernel mode stack with bpf_get_stackid(). It returns hash of the backtrace, and if it collides with another backtrace before the agent has collected it, we might report wrong kernel backtracec. -A more suitable helper bpf_get_stack() was added in commit [c195651e565ae7f41a68acb7d4aa7390ad215de1](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c195651e565ae7f41a68acb7d4aa7390ad215de1). - - Kernel version check -------------------- Affects kernel < 5.0. diff --git a/doc/internals.md b/doc/internals.md index 32995e541..a0a52fc2e 100644 --- a/doc/internals.md +++ b/doc/internals.md @@ -104,12 +104,6 @@ network would be very wasteful. We use trace hashing to avoid this. Different hashing schemes are used for the BPF and user-mode trace representations. Multiple 64 bit hashes can end up being mapped to the same 128 bit hash, but *not* vice-versa. -**BPF trace hash (64 bit):** - -``` -H(kernel_stack_id, frames_user, PID) -``` - **User-land trace hash (128 bit)** ``` @@ -382,4 +376,4 @@ probabilistic profiling is either enabled or disabled. The default value is 1 mi The following example shows how to configure the profiling agent with a threshold of 50 and an interval of 2 minutes and 30 seconds: ```bash sudo ./ebpf-profiler -probabilistic-threshold=50 -probabilistic-interval=2m30s -``` \ No newline at end of file +``` diff --git a/support/ebpf/bpfdefs.h b/support/ebpf/bpfdefs.h index 12d992a85..6e70738e8 100644 --- a/support/ebpf/bpfdefs.h +++ b/support/ebpf/bpfdefs.h @@ -73,7 +73,8 @@ static inline int bpf_map_delete_elem(UNUSED void *map, UNUSED const void *key) return -1; } -static inline int bpf_get_stackid(UNUSED void *ctx, UNUSED void *map, UNUSED u64 flags) +static inline long +bpf_get_stack(UNUSED void *ctx, UNUSED void *buf, UNUSED u32 size, UNUSED u64 flags) { return -1; } @@ -109,8 +110,9 @@ static unsigned long long (*bpf_get_current_task)(void) = (void *)BPF_FUNC static int (*bpf_perf_event_output)( void *ctx, void *map, unsigned long long flags, void *data, int size) = (void *) BPF_FUNC_perf_event_output; -static int (*bpf_get_stackid)(void *ctx, void *map, u64 flags) = (void *)BPF_FUNC_get_stackid; -static unsigned long long (*bpf_get_prandom_u32)(void) = (void *)BPF_FUNC_get_prandom_u32; +static long (*bpf_get_stack)(void *ctx, void *buf, u32 size, u64 flags) = (void *) + BPF_FUNC_get_stack; +static unsigned long long (*bpf_get_prandom_u32)(void) = (void *)BPF_FUNC_get_prandom_u32; __attribute__((format(printf, 1, 3))) static int (*bpf_trace_printk)( const char *fmt, int fmt_size, ...) = (void *)BPF_FUNC_trace_printk; diff --git a/support/ebpf/extmaps.h b/support/ebpf/extmaps.h index 0dc530e9a..01d2f00fe 100644 --- a/support/ebpf/extmaps.h +++ b/support/ebpf/extmaps.h @@ -6,7 +6,6 @@ // References to map definitions in *.ebpf.c. extern struct perf_progs_t perf_progs; extern struct per_cpu_records_t per_cpu_records; -extern struct kernel_stackmap_t kernel_stackmap; extern struct pid_page_to_mapping_info_t pid_page_to_mapping_info; extern struct metrics_t metrics; extern struct report_events_t report_events; diff --git a/support/ebpf/integration_test.ebpf.c b/support/ebpf/integration_test.ebpf.c index ced8b3139..16ac23bcf 100644 --- a/support/ebpf/integration_test.ebpf.c +++ b/support/ebpf/integration_test.ebpf.c @@ -6,7 +6,7 @@ #include "tracemgmt.h" #include "types.h" -static EBPF_INLINE void send_sample_traces(void *ctx, u64 pid, s32 kstack) +static EBPF_INLINE void send_sample_traces(void *ctx, u64 pid) { // Use the per CPU record for trace storage: it's too big for stack. PerCPURecord *record = get_pristine_per_cpu_record(); @@ -24,10 +24,9 @@ static EBPF_INLINE void send_sample_traces(void *ctx, u64 pid, s32 kstack) trace->origin = TRACE_SAMPLING; - trace->comm[3] = 1; - trace->pid = pid; - trace->tid = pid; - trace->kernel_stack_id = -1; + trace->comm[3] = 1; + trace->pid = pid; + trace->tid = pid; u64 *data = push_frame(&record->state, trace, FRAME_MARKER_NATIVE, 0, 21, 1); if (data) { @@ -36,23 +35,29 @@ static EBPF_INLINE void send_sample_traces(void *ctx, u64 pid, s32 kstack) send_trace(ctx, trace); // Single native frame, with kernel trace. - trace->comm[3] = 2; - trace->kernel_stack_id = kstack; + trace->frame_data_len = 0; + trace->num_frames = 0; + trace->num_kernel_frames = 0; + trace->comm[3] = 2; + push_kernel_frames(ctx, trace); + data = push_frame(&record->state, trace, FRAME_MARKER_NATIVE, 0, 21, 1); + if (data) { + data[0] = 1337; + } send_trace(ctx, trace); } -// tracepoint_integration__sched_switch fetches the current kernel stack ID from -// kernel_stackmap and communicates it to userspace via kernel_stack_id map. +// tracepoint_integration__sched_switch captures the kernel stack inline +// and sends sample traces to userspace. SEC("tracepoint/integration/sched_switch") int tracepoint_integration__sched_switch(void *ctx) { u64 id = bpf_get_current_pid_tgid(); u64 pid = id >> 32; - s32 kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID); - printt("pid %lld with kernel_stack_id %d", pid, kernel_stack_id); + printt("pid %lld in integration test", pid); - send_sample_traces(ctx, pid, kernel_stack_id); + send_sample_traces(ctx, pid); return 0; } diff --git a/support/ebpf/interpreter_dispatcher.ebpf.c b/support/ebpf/interpreter_dispatcher.ebpf.c index 1ee2f6f0a..9c0dec756 100644 --- a/support/ebpf/interpreter_dispatcher.ebpf.c +++ b/support/ebpf/interpreter_dispatcher.ebpf.c @@ -678,7 +678,7 @@ static EBPF_INLINE int unwind_stop(struct pt_regs *ctx) // If the stack is otherwise empty, push an error for that: we should // never encounter empty stacks for successful unwinding. - if (trace->frame_data_len == 0 && trace->kernel_stack_id < 0) { + if (trace->frame_data_len == 0) { DEBUG_PRINT("unwind_stop called but the stack is empty"); increment_metric(metricID_ErrEmptyStack); if (!state->unwind_error) { @@ -715,7 +715,7 @@ static EBPF_INLINE int unwind_stop(struct pt_regs *ctx) // through different data structures, we'd have to keep a list of known empty traces to // also prevent the corresponding trace counts to be sent out. OTOH, if we do it here, // this is trivial. - if (trace->frame_data_len == 1 && trace->kernel_stack_id < 0 && state->unwind_error) { + if (trace->frame_data_len == 1 && state->unwind_error) { if (filter_error_frames) { return 0; } diff --git a/support/ebpf/kernel.h b/support/ebpf/kernel.h index f582de9bc..f5c37144c 100644 --- a/support/ebpf/kernel.h +++ b/support/ebpf/kernel.h @@ -185,12 +185,11 @@ enum bpf_map_type { BPF_MAP_TYPE_CGRP_STORAGE, }; -// Flags bpf_get_stackid/bpf_get_stack. +// Flags for bpf_get_stack. enum { BPF_F_SKIP_FIELD_MASK = 0xffULL, BPF_F_USER_STACK = (1ULL << 8), BPF_F_FAST_STACK_CMP = (1ULL << 9), - BPF_F_REUSE_STACKID = (1ULL << 10), BPF_F_USER_BUILD_ID = (1ULL << 11), }; diff --git a/support/ebpf/native_stack_trace.ebpf.c b/support/ebpf/native_stack_trace.ebpf.c index 6e8cda2a9..eff6643b0 100644 --- a/support/ebpf/native_stack_trace.ebpf.c +++ b/support/ebpf/native_stack_trace.ebpf.c @@ -89,16 +89,6 @@ struct stack_delta_page_to_info_t { __uint(max_entries, 40000); } stack_delta_page_to_info SEC(".maps"); -// This contains the kernel PCs as returned by bpf_get_stackid(). Unfortunately the ebpf -// program cannot read the contents, so we return the stackid in the Trace directly, and -// make the profiling agent read the kernel mode stack trace portion from this map. -struct kernel_stackmap_t { - __uint(type, BPF_MAP_TYPE_STACK_TRACE); - __type(key, u32); - __type(value, u64[PERF_MAX_STACK_DEPTH]); - __uint(max_entries, 16 * 1024); -} kernel_stackmap SEC(".maps"); - #include "native_stack_trace.h" // unwind_native is the tail call destination for PROG_UNWIND_NATIVE. diff --git a/support/ebpf/tracemgmt.h b/support/ebpf/tracemgmt.h index 3850f53e5..98c8b6a89 100644 --- a/support/ebpf/tracemgmt.h +++ b/support/ebpf/tracemgmt.h @@ -233,12 +233,12 @@ static inline EBPF_INLINE PerCPURecord *get_pristine_per_cpu_record() record->ratelimitAction = RATELIMIT_ACTION_DEFAULT; record->customLabelsState.go_m_ptr = NULL; - Trace *trace = &record->trace; - trace->kernel_stack_id = -1; - trace->frame_data_len = 0; - trace->num_frames = 0; - trace->pid = 0; - trace->tid = 0; + Trace *trace = &record->trace; + trace->frame_data_len = 0; + trace->num_frames = 0; + trace->num_kernel_frames = 0; + trace->pid = 0; + trace->tid = 0; trace->apm_trace_id.as_int.hi = 0; trace->apm_trace_id.as_int.lo = 0; @@ -399,15 +399,37 @@ static inline EBPF_INLINE void push_abort(Trace *trace, ErrorCode error) } } +// push_kernel_frames captures the kernel stack via bpf_get_stack() and stores +// the raw addresses at the beginning of frame_data. Must be called before any +// userspace frames are pushed. The num_kernel_frames field tells userspace how +// many leading frame_data entries are kernel addresses. +static inline EBPF_INLINE void push_kernel_frames(void *ctx, Trace *trace) +{ + _Static_assert( + sizeof(trace->frame_data) > PERF_MAX_STACK_DEPTH * sizeof(u64), "frame data too small"); + long bytes = bpf_get_stack(ctx, trace->frame_data, PERF_MAX_STACK_DEPTH * sizeof(u64), 0); + if (bytes > 0) { + int nframes = bytes / sizeof(u64); + trace->num_kernel_frames = nframes; + trace->frame_data_len = nframes; + } +} + // Send a trace to user-land via the `trace_events` perf event buffer. static inline EBPF_INLINE void send_trace(void *ctx, Trace *trace) { - const u64 send_size = sizeof(Trace) - sizeof(trace->frame_data) + - sizeof(trace->frame_data[0]) * trace->frame_data_len; - - if (send_size < sizeof(Trace)) { - bpf_perf_event_output(ctx, &trace_events, BPF_F_CURRENT_CPU, trace, send_size); - } + // Explicitly clamp frame_data_len for the verifier. In production the value + // is always within bounds, but when send_trace is inlined into the same + // program as push_frame (e.g. the integration test), the verifier cannot + // track frame_data_len through memory stores and reloads. + u16 len = trace->frame_data_len; + if (len > sizeof(trace->frame_data) / sizeof(trace->frame_data[0])) { + len = sizeof(trace->frame_data) / sizeof(trace->frame_data[0]); + } + const u64 send_size = + sizeof(Trace) - sizeof(trace->frame_data) + sizeof(trace->frame_data[0]) * len; + + bpf_perf_event_output(ctx, &trace_events, BPF_F_CURRENT_CPU, trace, send_size); } // is_kernel_address checks if the given address looks like virtual address to kernel memory. @@ -776,9 +798,8 @@ static inline EBPF_INLINE int collect_trace( increment_metric(metricID_ErrBPFCurrentComm); } - // Get the kernel mode stack trace first - trace->kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID); - DEBUG_PRINT("kernel stack id = %d", trace->kernel_stack_id); + // Capture kernel stack and push each frame into frame_data. + push_kernel_frames(ctx, trace); if (pid == 0) { tail_call(ctx, PROG_UNWIND_STOP); diff --git a/support/ebpf/tracer.ebpf.amd64 b/support/ebpf/tracer.ebpf.amd64 index ac546cdee..96d310a2c 100644 Binary files a/support/ebpf/tracer.ebpf.amd64 and b/support/ebpf/tracer.ebpf.amd64 differ diff --git a/support/ebpf/tracer.ebpf.arm64 b/support/ebpf/tracer.ebpf.arm64 index bdaeea385..42936b940 100644 Binary files a/support/ebpf/tracer.ebpf.arm64 and b/support/ebpf/tracer.ebpf.arm64 differ diff --git a/support/ebpf/types.h b/support/ebpf/types.h index 8a1fed91c..c3cd27fca 100644 --- a/support/ebpf/types.h +++ b/support/ebpf/types.h @@ -676,12 +676,13 @@ typedef struct Trace { ApmTraceID apm_trace_id; // Custom Labels CustomLabelsArray custom_labels; - // The kernel stack ID. - s32 kernel_stack_id; // The number of frame_data elements present. u16 frame_data_len; // The number of frames present. u16 num_frames; + // The number of kernel stack frames at the start of frame_data. + // These are raw u64 addresses from bpf_get_stack(), not encoded frames. + u16 num_kernel_frames; // origin indicates the source of the trace. TraceOrigin origin; diff --git a/support/types.go b/support/types.go index 2e3e02a4f..3753d72cb 100644 --- a/support/types.go +++ b/support/types.go @@ -90,10 +90,6 @@ const ( HSTSIDSegMapMask = 0xffffffffffffff ) -const ( - PerfMaxStackDepth = 0x7f -) - const ( TraceOriginUnknown = 0x0 TraceOriginSampling = 0x1 @@ -170,9 +166,9 @@ type Trace struct { Apm_transaction_id [8]byte Apm_trace_id [16]byte Custom_labels CustomLabelsArray - Kernel_stack_id int32 Frame_data_len uint16 Num_frames uint16 + Num_kernel_frames uint16 Origin uint32 Offtime uint64 Frame_data [3072]uint64 diff --git a/support/types_def.go b/support/types_def.go index b2ccc3d47..c68015cd5 100644 --- a/support/types_def.go +++ b/support/types_def.go @@ -100,11 +100,6 @@ const ( HSTSIDSegMapMask = C.HS_TSID_SEG_MAP_MASK ) -const ( - // PerfMaxStackDepth is the bpf map data array length for BPF_MAP_TYPE_STACK_TRACE traces - PerfMaxStackDepth = C.PERF_MAX_STACK_DEPTH -) - const ( TraceOriginUnknown = C.TRACE_UNKNOWN TraceOriginSampling = C.TRACE_SAMPLING diff --git a/tracer/ebpf_integration_test.go b/tracer/ebpf_integration_test.go index b14816d0b..2b9808457 100644 --- a/tracer/ebpf_integration_test.go +++ b/tracer/ebpf_integration_test.go @@ -91,8 +91,7 @@ func runKernelFrameProbe(t *testing.T, tr *tracer.Tracer) { type trace struct { numKernelFrames int - - frames libpf.EbpfFrame + frames libpf.EbpfFrame } func TestTracerErrorPropagation(t *testing.T) { diff --git a/tracer/tracer.go b/tracer/tracer.go index a8bb47745..8270b2130 100644 --- a/tracer/tracer.go +++ b/tracer/tracer.go @@ -851,49 +851,28 @@ func loadProgram(ebpfProgs map[string]*cebpf.Program, tailcallMap *cebpf.Map, return nil } -// readKernelFrames fetches the kernel stack frames for a particular kstackID and -// returns them as symbolized libpf.Frames. -func (t *Tracer) readKernelFrames(kstackID int32, oldFrames libpf.Frames) (libpf.Frames, error) { - cKstackID := kstackID - kstackVal := make([]uint64, support.PerfMaxStackDepth) - - if err := t.ebpfMaps["kernel_stackmap"].Lookup(unsafe.Pointer(&cKstackID), - unsafe.Pointer(&kstackVal[0])); err != nil { - return nil, fmt.Errorf("failed to lookup kernel frames for stackID %d: %v", kstackID, err) - } - - // The kernel returns absolute addresses in kernel address - // space format. Here just the stack length is needed. - // But also debug print the symbolization based on kallsyms. - var kstackLen uint32 - for kstackLen < support.PerfMaxStackDepth && kstackVal[kstackLen] != 0 { - kstackLen++ - } - +// symbolizeKernelFrames converts raw kernel addresses into symbolized frames. +func (t *Tracer) symbolizeKernelFrames(addrs []uint64, oldFrames libpf.Frames) libpf.Frames { frames := oldFrames - if kstackLen > uint32(len(frames)) { - frames = make(libpf.Frames, 0, kstackLen) + if len(addrs) > len(frames) { + frames = make(libpf.Frames, 0, len(addrs)) } - for i := uint32(0); i < kstackLen; i++ { - address := libpf.Address(kstackVal[i]) + for _, addr := range addrs { + address := libpf.Address(addr) frame := libpf.Frame{ Type: libpf.KernelFrame, AddressOrLineno: libpf.AddressOrLineno(address - 1), } - - kmod, err := t.kernelSymbolizer.GetModuleByAddress(address) - if err == nil { + if kmod, err := t.kernelSymbolizer.GetModuleByAddress(address); err == nil { frame.Mapping = kmod.Mapping() frame.AddressOrLineno -= libpf.AddressOrLineno(kmod.Start()) - if funcName, _, err := kmod.LookupSymbolByAddress(address); err == nil { frame.FunctionName = libpf.Intern(funcName) } } frames.Append(&frame) } - - return frames, nil + return frames } // enableEvent removes the entry of given eventType from the inhibitEvents map @@ -1063,9 +1042,6 @@ var ( ) // loadBpfTrace parses a raw BPF trace into a `host.Trace` instance. -// -// If the raw trace contains a kernel stack ID, the kernel stack is also -// retrieved and inserted at the appropriate position. func (t *Tracer) loadBpfTrace(raw []byte, cpu int) (*libpf.EbpfTrace, error) { frameListOffs := int(unsafe.Offsetof(support.Trace{}.Frame_data)) @@ -1110,24 +1086,6 @@ func (t *Tracer) loadBpfTrace(raw []byte, cpu int) (*libpf.EbpfTrace, error) { return nil, fmt.Errorf("origin %d: %w", trace.Origin, errOriginUnexpected) } - clLen := int(ptr.Custom_labels.Len) - if clLen > 0 { - trace.CustomLabels = make(map[libpf.String]libpf.String, clLen) - for i := 0; i < clLen; i++ { - lbl := ptr.Custom_labels.Labels[i] - key := goString(lbl.Key[:]) - val := goString(lbl.Val[:]) - trace.CustomLabels[key] = val - } - } - if ptr.Kernel_stack_id >= 0 { - var err error - trace.KernelFrames, err = t.readKernelFrames(ptr.Kernel_stack_id, trace.KernelFrames) - if err != nil { - log.Errorf("Failed to get kernel stack frames: %v", err) - } - } - if ptr.Custom_labels.Len > 0 { trace.CustomLabels = make(map[libpf.String]libpf.String, int(ptr.Custom_labels.Len)) for i := 0; i < int(ptr.Custom_labels.Len); i++ { @@ -1139,8 +1097,17 @@ func (t *Tracer) loadBpfTrace(raw []byte, cpu int) (*libpf.EbpfTrace, error) { } trace.NumFrames = int(ptr.Num_frames) - trace.FrameData = trace.FrameDataBuf[:ptr.Frame_data_len] - copy(trace.FrameData, ptr.Frame_data[:ptr.Frame_data_len]) + + // Symbolize kernel frames directly from the raw BPF data before copying + // userspace frame data, so we only copy what's needed. + numKernelFrames := int(ptr.Num_kernel_frames) + if numKernelFrames > 0 { + trace.KernelFrames = t.symbolizeKernelFrames( + ptr.Frame_data[:numKernelFrames], trace.KernelFrames) + } + userFrameLen := int(ptr.Frame_data_len) - numKernelFrames + trace.FrameData = trace.FrameDataBuf[:userFrameLen] + copy(trace.FrameData, ptr.Frame_data[numKernelFrames:ptr.Frame_data_len]) return trace, nil } @@ -1428,7 +1395,7 @@ func (t *Tracer) AttachProbes(probes []string) error { func (t *Tracer) HandleTrace(bpfTrace *libpf.EbpfTrace) { t.processManager.HandleTrace(bpfTrace) - // Reclain the EbpfTrace + // Reclaim the EbpfTrace bpfTrace.KernelFrames = bpfTrace.KernelFrames[0:0] t.tracePool.Put(bpfTrace) }