Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/unit-test-on-pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ jobs:
needs: build-integration-test-binaries
timeout-minutes: 10
strategy:
fail-fast: false
matrix:
include:
# List of available kernels here:
Expand Down
8 changes: 0 additions & 8 deletions doc/KNOWN_KERNEL_LIMITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,6 @@ There was a limit of 1 eBPF program per tracepoint/kprobe.
This limit no longer holds and was removed with commit [e87c6bc3852b981e71c757be20771546ce9f76f3](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e87c6bc3852b981e71c757be20771546ce9f76f3).


Obtaining Kernel backtrace
--------------------------
Affects kernel < 4.18

It is not possible to get individual backtraces from the kernel-mode stack with bpf_get_stackid(). It returns a hash of the backtrace, and if it collides with another backtrace before the agent has collected it, we might report a wrong kernel backtrace.
A more suitable helper bpf_get_stack() was added in commit [c195651e565ae7f41a68acb7d4aa7390ad215de1](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c195651e565ae7f41a68acb7d4aa7390ad215de1).


Kernel version check
--------------------
Affects kernel < 5.0.
Expand Down
8 changes: 1 addition & 7 deletions doc/internals.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,6 @@ network would be very wasteful. We use trace hashing to avoid this. Different
hashing schemes are used for the BPF and user-mode trace representations. Multiple
64 bit hashes can end up being mapped to the same 128 bit hash, but *not* vice-versa.

**BPF trace hash (64 bit):**

```
H(kernel_stack_id, frames_user, PID)
```

**User-land trace hash (128 bit)**

```
Expand Down Expand Up @@ -382,4 +376,4 @@ probabilistic profiling is either enabled or disabled. The default value is 1 mi
The following example shows how to configure the profiling agent with a threshold of 50 and an interval of 2 minutes and 30 seconds:
```bash
sudo ./ebpf-profiler -probabilistic-threshold=50 -probabilistic-interval=2m30s
```
```
8 changes: 5 additions & 3 deletions support/ebpf/bpfdefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ static inline int bpf_map_delete_elem(UNUSED void *map, UNUSED const void *key)
return -1;
}

static inline int bpf_get_stackid(UNUSED void *ctx, UNUSED void *map, UNUSED u64 flags)
static inline long
bpf_get_stack(UNUSED void *ctx, UNUSED void *buf, UNUSED u32 size, UNUSED u64 flags)
{
return -1;
}
Expand Down Expand Up @@ -109,8 +110,9 @@ static unsigned long long (*bpf_get_current_task)(void) = (void *)BPF_FUNC
static int (*bpf_perf_event_output)(
void *ctx, void *map, unsigned long long flags, void *data, int size) = (void *)
BPF_FUNC_perf_event_output;
static int (*bpf_get_stackid)(void *ctx, void *map, u64 flags) = (void *)BPF_FUNC_get_stackid;
static unsigned long long (*bpf_get_prandom_u32)(void) = (void *)BPF_FUNC_get_prandom_u32;
static long (*bpf_get_stack)(void *ctx, void *buf, u32 size, u64 flags) = (void *)
BPF_FUNC_get_stack;
static unsigned long long (*bpf_get_prandom_u32)(void) = (void *)BPF_FUNC_get_prandom_u32;

__attribute__((format(printf, 1, 3))) static int (*bpf_trace_printk)(
const char *fmt, int fmt_size, ...) = (void *)BPF_FUNC_trace_printk;
Expand Down
1 change: 0 additions & 1 deletion support/ebpf/extmaps.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
// References to map definitions in *.ebpf.c.
extern struct perf_progs_t perf_progs;
extern struct per_cpu_records_t per_cpu_records;
extern struct kernel_stackmap_t kernel_stackmap;
extern struct pid_page_to_mapping_info_t pid_page_to_mapping_info;
extern struct metrics_t metrics;
extern struct report_events_t report_events;
Expand Down
29 changes: 17 additions & 12 deletions support/ebpf/integration_test.ebpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "tracemgmt.h"
#include "types.h"

static EBPF_INLINE void send_sample_traces(void *ctx, u64 pid, s32 kstack)
static EBPF_INLINE void send_sample_traces(void *ctx, u64 pid)
{
// Use the per CPU record for trace storage: it's too big for stack.
PerCPURecord *record = get_pristine_per_cpu_record();
Expand All @@ -24,10 +24,9 @@ static EBPF_INLINE void send_sample_traces(void *ctx, u64 pid, s32 kstack)

trace->origin = TRACE_SAMPLING;

trace->comm[3] = 1;
trace->pid = pid;
trace->tid = pid;
trace->kernel_stack_id = -1;
trace->comm[3] = 1;
trace->pid = pid;
trace->tid = pid;

u64 *data = push_frame(&record->state, trace, FRAME_MARKER_NATIVE, 0, 21, 1);
if (data) {
Expand All @@ -36,23 +35,29 @@ static EBPF_INLINE void send_sample_traces(void *ctx, u64 pid, s32 kstack)
send_trace(ctx, trace);

// Single native frame, with kernel trace.
trace->comm[3] = 2;
trace->kernel_stack_id = kstack;
trace->frame_data_len = 0;
trace->num_frames = 0;
trace->num_kernel_frames = 0;
trace->comm[3] = 2;
push_kernel_frames(ctx, trace);
data = push_frame(&record->state, trace, FRAME_MARKER_NATIVE, 0, 21, 1);
if (data) {
data[0] = 1337;
}
send_trace(ctx, trace);
}

// tracepoint_integration__sched_switch fetches the current kernel stack ID from
// kernel_stackmap and communicates it to userspace via kernel_stack_id map.
// tracepoint_integration__sched_switch captures the kernel stack inline
// and sends sample traces to userspace.
SEC("tracepoint/integration/sched_switch")
int tracepoint_integration__sched_switch(void *ctx)
{
u64 id = bpf_get_current_pid_tgid();
u64 pid = id >> 32;

s32 kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID);
printt("pid %lld with kernel_stack_id %d", pid, kernel_stack_id);
printt("pid %lld in integration test", pid);

send_sample_traces(ctx, pid, kernel_stack_id);
send_sample_traces(ctx, pid);

return 0;
}
4 changes: 2 additions & 2 deletions support/ebpf/interpreter_dispatcher.ebpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ static EBPF_INLINE int unwind_stop(struct pt_regs *ctx)

// If the stack is otherwise empty, push an error for that: we should
// never encounter empty stacks for successful unwinding.
if (trace->frame_data_len == 0 && trace->kernel_stack_id < 0) {
if (trace->frame_data_len == 0) {
DEBUG_PRINT("unwind_stop called but the stack is empty");
increment_metric(metricID_ErrEmptyStack);
if (!state->unwind_error) {
Expand Down Expand Up @@ -715,7 +715,7 @@ static EBPF_INLINE int unwind_stop(struct pt_regs *ctx)
// through different data structures, we'd have to keep a list of known empty traces to
// also prevent the corresponding trace counts to be sent out. OTOH, if we do it here,
// this is trivial.
if (trace->frame_data_len == 1 && trace->kernel_stack_id < 0 && state->unwind_error) {
if (trace->frame_data_len == 1 && state->unwind_error) {
if (filter_error_frames) {
return 0;
}
Expand Down
3 changes: 1 addition & 2 deletions support/ebpf/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,12 +185,11 @@ enum bpf_map_type {
BPF_MAP_TYPE_CGRP_STORAGE,
};

// Flags bpf_get_stackid/bpf_get_stack.
// Flags for bpf_get_stack.
enum {
BPF_F_SKIP_FIELD_MASK = 0xffULL,
BPF_F_USER_STACK = (1ULL << 8),
BPF_F_FAST_STACK_CMP = (1ULL << 9),
BPF_F_REUSE_STACKID = (1ULL << 10),
BPF_F_USER_BUILD_ID = (1ULL << 11),
};

Expand Down
10 changes: 0 additions & 10 deletions support/ebpf/native_stack_trace.ebpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,6 @@ struct stack_delta_page_to_info_t {
__uint(max_entries, 40000);
} stack_delta_page_to_info SEC(".maps");

// This contains the kernel PCs as returned by bpf_get_stackid(). Unfortunately the ebpf
// program cannot read the contents, so we return the stackid in the Trace directly, and
// make the profiling agent read the kernel mode stack trace portion from this map.
struct kernel_stackmap_t {
__uint(type, BPF_MAP_TYPE_STACK_TRACE);
__type(key, u32);
__type(value, u64[PERF_MAX_STACK_DEPTH]);
__uint(max_entries, 16 * 1024);
} kernel_stackmap SEC(".maps");

#include "native_stack_trace.h"

// unwind_native is the tail call destination for PROG_UNWIND_NATIVE.
Expand Down
51 changes: 36 additions & 15 deletions support/ebpf/tracemgmt.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,12 +233,12 @@ static inline EBPF_INLINE PerCPURecord *get_pristine_per_cpu_record()
record->ratelimitAction = RATELIMIT_ACTION_DEFAULT;
record->customLabelsState.go_m_ptr = NULL;

Trace *trace = &record->trace;
trace->kernel_stack_id = -1;
trace->frame_data_len = 0;
trace->num_frames = 0;
trace->pid = 0;
trace->tid = 0;
Trace *trace = &record->trace;
trace->frame_data_len = 0;
trace->num_frames = 0;
trace->num_kernel_frames = 0;
trace->pid = 0;
trace->tid = 0;

trace->apm_trace_id.as_int.hi = 0;
trace->apm_trace_id.as_int.lo = 0;
Expand Down Expand Up @@ -399,15 +399,37 @@ static inline EBPF_INLINE void push_abort(Trace *trace, ErrorCode error)
}
}

// push_kernel_frames captures the kernel stack via bpf_get_stack() and stores
// the raw addresses at the beginning of frame_data. Must be called before any
// userspace frames are pushed. The num_kernel_frames field tells userspace how
// many leading frame_data entries are kernel addresses.
static inline EBPF_INLINE void push_kernel_frames(void *ctx, Trace *trace)
{
  // Guarantee at compile time that the kernel stack can never overrun
  // frame_data, so the bpf_get_stack() size argument below is provably
  // in bounds for the eBPF verifier.
  _Static_assert(
    sizeof(trace->frame_data) > PERF_MAX_STACK_DEPTH * sizeof(u64), "frame data too small");
  // bpf_get_stack() with flags=0 captures the current kernel stack as raw
  // u64 instruction addresses directly into frame_data. It returns the
  // number of bytes written, or a negative error code on failure.
  long bytes = bpf_get_stack(ctx, trace->frame_data, PERF_MAX_STACK_DEPTH * sizeof(u64), 0);
  if (bytes > 0) {
    // Convert the byte count to a frame count. The helper writes whole
    // u64 entries, so the division is exact.
    int nframes = bytes / sizeof(u64);
    trace->num_kernel_frames = nframes;
    // frame_data_len is set (not incremented): this must run before any
    // userspace frames are pushed, so kernel addresses occupy the leading
    // frame_data entries as documented above. On error (bytes <= 0) both
    // fields keep their pristine zero values and the trace simply has no
    // kernel portion.
    trace->frame_data_len = nframes;
  }
}

// Send a trace to user-land via the `trace_events` perf event buffer.
static inline EBPF_INLINE void send_trace(void *ctx, Trace *trace)
{
const u64 send_size = sizeof(Trace) - sizeof(trace->frame_data) +
sizeof(trace->frame_data[0]) * trace->frame_data_len;

if (send_size < sizeof(Trace)) {
bpf_perf_event_output(ctx, &trace_events, BPF_F_CURRENT_CPU, trace, send_size);
}
// Explicitly clamp frame_data_len for the verifier. In production the value
// is always within bounds, but when send_trace is inlined into the same
// program as push_frame (e.g. the integration test), the verifier cannot
// track frame_data_len through memory stores and reloads.
u16 len = trace->frame_data_len;
if (len > sizeof(trace->frame_data) / sizeof(trace->frame_data[0])) {
len = sizeof(trace->frame_data) / sizeof(trace->frame_data[0]);
}
const u64 send_size =
sizeof(Trace) - sizeof(trace->frame_data) + sizeof(trace->frame_data[0]) * len;

bpf_perf_event_output(ctx, &trace_events, BPF_F_CURRENT_CPU, trace, send_size);
}

// is_kernel_address checks if the given address looks like virtual address to kernel memory.
Expand Down Expand Up @@ -776,9 +798,8 @@ static inline EBPF_INLINE int collect_trace(
increment_metric(metricID_ErrBPFCurrentComm);
}

// Get the kernel mode stack trace first
trace->kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID);
DEBUG_PRINT("kernel stack id = %d", trace->kernel_stack_id);
// Capture kernel stack and push each frame into frame_data.
push_kernel_frames(ctx, trace);

if (pid == 0) {
tail_call(ctx, PROG_UNWIND_STOP);
Expand Down
Binary file modified support/ebpf/tracer.ebpf.amd64
Binary file not shown.
Binary file modified support/ebpf/tracer.ebpf.arm64
Binary file not shown.
5 changes: 3 additions & 2 deletions support/ebpf/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -676,12 +676,13 @@ typedef struct Trace {
ApmTraceID apm_trace_id;
// Custom Labels
CustomLabelsArray custom_labels;
// The kernel stack ID.
s32 kernel_stack_id;
// The number of frame_data elements present.
u16 frame_data_len;
// The number of frames present.
u16 num_frames;
// The number of kernel stack frames at the start of frame_data.
// These are raw u64 addresses from bpf_get_stack(), not encoded frames.
u16 num_kernel_frames;

// origin indicates the source of the trace.
TraceOrigin origin;
Expand Down
6 changes: 1 addition & 5 deletions support/types.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 0 additions & 5 deletions support/types_def.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,6 @@ const (
HSTSIDSegMapMask = C.HS_TSID_SEG_MAP_MASK
)

const (
// PerfMaxStackDepth is the bpf map data array length for BPF_MAP_TYPE_STACK_TRACE traces
PerfMaxStackDepth = C.PERF_MAX_STACK_DEPTH
)

const (
TraceOriginUnknown = C.TRACE_UNKNOWN
TraceOriginSampling = C.TRACE_SAMPLING
Expand Down
3 changes: 1 addition & 2 deletions tracer/ebpf_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,7 @@ func runKernelFrameProbe(t *testing.T, tr *tracer.Tracer) {

type trace struct {
numKernelFrames int

frames libpf.EbpfFrame
frames libpf.EbpfFrame
}

func TestTracerErrorPropagation(t *testing.T) {
Expand Down
Loading
Loading