diff --git a/interpreter/python/python.go b/interpreter/python/python.go index ba52263bd..e1fbfe635 100644 --- a/interpreter/python/python.go +++ b/interpreter/python/python.go @@ -483,10 +483,13 @@ func (p *pythonInstance) getCodeObject(addr libpf.Address, if addr == 0 { return nil, errors.New("failed to read code object: null pointer") } - if value, ok := p.addrToCodeObject.Get(addr); ok { - m := value - if m.ebpfChecksum == ebpfChecksum { - return m, nil + if ebpfChecksum != 0 { + // A zero checksum indicates code object read failed in the kernel (e.g. paged out). + if value, ok := p.addrToCodeObject.Get(addr); ok { + m := value + if m.ebpfChecksum == ebpfChecksum { + return m, nil + } } } @@ -541,7 +544,7 @@ func (p *pythonInstance) getCodeObject(addr libpf.Address, ebpfChecksumCalculated := (argCount << 25) + (kwonlyArgCount << 18) + (flags << 10) + firstLineNo - if ebpfChecksum != ebpfChecksumCalculated { + if ebpfChecksum != 0 && ebpfChecksum != ebpfChecksumCalculated { return nil, fmt.Errorf("read code object was stale: %x != %x", ebpfChecksum, ebpfChecksumCalculated) } @@ -562,7 +565,7 @@ func (p *pythonInstance) getCodeObject(addr libpf.Address, sourceFileName: libpf.Intern(sourceFileName), firstLineNo: firstLineNo, lineTable: lineTable, - ebpfChecksum: ebpfChecksum, + ebpfChecksum: ebpfChecksumCalculated, } p.addrToCodeObject.Add(addr, pco) return pco, nil diff --git a/support/ebpf/bpfdefs.h b/support/ebpf/bpfdefs.h index b153c8dc0..91a261c6d 100644 --- a/support/ebpf/bpfdefs.h +++ b/support/ebpf/bpfdefs.h @@ -46,6 +46,12 @@ static inline long bpf_probe_read_user(void *buf, u32 sz, const void *ptr) return __bpf_probe_read_user(__cgo_ctx->id, buf, sz, ptr); } +static inline long bpf_probe_read_user_with_test_fault(void *buf, u32 sz, const void *ptr) +{ + long __bpf_probe_read_user_with_test_fault(u64, void *, u32, const void *); + return __bpf_probe_read_user_with_test_fault(__cgo_ctx->id, buf, sz, ptr); +} + static inline long 
bpf_probe_read_kernel(UNUSED void *buf, UNUSED u32 sz, UNUSED const void *ptr) { return -1; @@ -122,6 +128,8 @@ static long (*bpf_probe_read_user)(void *dst, int size, const void *unsafe_ptr) static long (*bpf_probe_read_kernel)(void *dst, int size, const void *unsafe_ptr) = (void *) BPF_FUNC_probe_read_kernel; + #define bpf_probe_read_user_with_test_fault bpf_probe_read_user + #define printt(fmt, ...) \ ({ \ const char ____fmt[] = fmt; \ diff --git a/support/ebpf/python_tracer.ebpf.c b/support/ebpf/python_tracer.ebpf.c index cab564897..9eb455728 100644 --- a/support/ebpf/python_tracer.ebpf.c +++ b/support/ebpf/python_tracer.ebpf.c @@ -141,10 +141,15 @@ static EBPF_INLINE ErrorCode process_python_frame( } // Read PyCodeObject - if (bpf_probe_read_user(pss->code, sizeof(pss->code), py_codeobject)) { + if (bpf_probe_read_user_with_test_fault(pss->code, sizeof(pss->code), py_codeobject)) { DEBUG_PRINT("Failed to read PyCodeObject at 0x%lx", (unsigned long)(py_codeobject)); increment_metric(metricID_UnwindPythonErrBadCodeObjectArgCountAddr); - return ERR_PYTHON_BAD_CODE_OBJECT_ADDR; + // Push the frame with the code object address so the agent can try to + // read it in userspace (which can take page faults unlike BPF). + // codeobject_id=0 distinguishes this from a successful read. 
+ file_id = (u64)py_codeobject; + lineno = py_encode_lineno(0, (u32)py_f_lasti); + goto push_frame; } int py_argcount = *(int *)(&pss->code[pyinfo->PyCodeObject_co_argcount]); diff --git a/support/ebpf/tracer.ebpf.amd64 b/support/ebpf/tracer.ebpf.amd64 index 8425df149..2b79e45ad 100644 Binary files a/support/ebpf/tracer.ebpf.amd64 and b/support/ebpf/tracer.ebpf.amd64 differ diff --git a/support/ebpf/tracer.ebpf.arm64 b/support/ebpf/tracer.ebpf.arm64 index d1a47909a..85823d8b7 100644 Binary files a/support/ebpf/tracer.ebpf.arm64 and b/support/ebpf/tracer.ebpf.arm64 differ diff --git a/tools/coredump/analyze.go b/tools/coredump/analyze.go index b3c793ce5..a17a41263 100644 --- a/tools/coredump/analyze.go +++ b/tools/coredump/analyze.go @@ -111,7 +111,7 @@ func (cmd *analyzeCmd) exec(context.Context, []string) (err error) { } defer proc.Close() - threads, err := ExtractTraces(context.Background(), proc, cmd.debugEbpf, lwpFilter) + threads, err := ExtractTraces(context.Background(), proc, cmd.debugEbpf, lwpFilter, nil) if err != nil { return fmt.Errorf("failed to extract traces: %w", err) } diff --git a/tools/coredump/coredump.go b/tools/coredump/coredump.go index b4dd54754..d653935da 100644 --- a/tools/coredump/coredump.go +++ b/tools/coredump/coredump.go @@ -116,7 +116,7 @@ func (t *traceReporter) ReportTraceEvent(trace *libpf.Trace, meta *samples.Trace } func ExtractTraces(ctx context.Context, pr process.Process, debug bool, - lwpFilter libpf.Set[libpf.PID]) ([]ThreadInfo, error) { + lwpFilter libpf.Set[libpf.PID], faultAddresses map[uintptr]int) ([]ThreadInfo, error) { todo, cancel := context.WithCancel(ctx) defer cancel() @@ -159,7 +159,7 @@ func ExtractTraces(ctx context.Context, pr process.Process, debug bool, } // Interfaces for the managers - ebpfCtx := newEBPFContext(pr) + ebpfCtx := newEBPFContext(pr, faultAddresses) defer ebpfCtx.release() inverse_pac_mask := ^(pr.GetMachineData().CodePACMask) diff --git a/tools/coredump/coredump_test.go 
 b/tools/coredump/coredump_test.go index c873e53bd..fb9163487 100644 --- a/tools/coredump/coredump_test.go +++ b/tools/coredump/coredump_test.go @@ -4,6 +4,7 @@ package main import ( + "strconv" "testing" "github.com/stretchr/testify/require" @@ -11,6 +12,25 @@ import ( "go.opentelemetry.io/ebpf-profiler/tools/coredump/modulestore" ) +// parseFaultAddresses converts the hex/decimal address strings from a test +// case JSON into the uintptr-keyed map consumed by the ebpfContext. The int +// values are hit counters initialized to 0; TestCoreDumps will fail the test +// if any remain 0 after the unwind. ParseUint with base=0 honors a "0x" +// prefix, so both "0x7f12..." and decimal forms work. +func parseFaultAddresses(t *testing.T, raw []string) map[uintptr]int { + t.Helper() + if len(raw) == 0 { + return nil + } + out := make(map[uintptr]int, len(raw)) + for _, s := range raw { + v, err := strconv.ParseUint(s, 0, 64) + require.NoErrorf(t, err, "invalid fault-address %q", s) + out[uintptr(v)] = 0 + } + return out +} + func TestCoreDumps(t *testing.T) { cases, err := findTestCases(true) require.NoError(t, err) @@ -34,10 +54,23 @@ func TestCoreDumps(t *testing.T) { require.NoError(t, err) defer core.Close() - data, err := ExtractTraces(t.Context(), core, false, nil) + faults := parseFaultAddresses(t, testCase.FaultAddresses) + data, err := ExtractTraces(t.Context(), core, false, nil, faults) require.NoError(t, err) require.Equal(t, testCase.Threads, data) + + // Every fault address listed in the test case must have been + // visited at least once by bpf_probe_read_user_with_test_fault; + // otherwise the test isn't actually exercising the recovery path + // it claims to (e.g. a stale address that the unwinder no longer + // reads). The map is mutated in place by the helper, so we can + // just iterate the post-run state.
+ for addr, hits := range faults { + require.Greaterf(t, hits, 0, + "fault address 0x%x was never visited by "+ + "bpf_probe_read_user_with_test_fault", addr) + } }) } } diff --git a/tools/coredump/ebpfcontext.go b/tools/coredump/ebpfcontext.go index 911618df6..0c9fd58c1 100644 --- a/tools/coredump/ebpfcontext.go +++ b/tools/coredump/ebpfcontext.go @@ -57,6 +57,14 @@ type ebpfContext struct { // stackDeltaFileID is context variable for nested map lookups stackDeltaFileID C.u64 + + // faultAddresses maps user-space addresses on which + // bpf_probe_read_user_with_test_fault should pretend the kernel could not + // read (returns -1) to a hit counter. The presence of a key (regardless of + // value) is what triggers the fault; the int value records how many times + // the helper visited that address during the unwind so tests can assert + // every injected fault actually exercised the code path under test. + faultAddresses map[uintptr]int } // ebpfContextMap is global mapping of EBPFContext id (PIDandTGID) to the actual data. @@ -65,8 +73,11 @@ type ebpfContext struct { // passed directly to the C code). var ebpfContextMap = map[C.u64]*ebpfContext{} -// newEBPFContext creates new EBPF Context from given core dump image -func newEBPFContext(pr process.Process) *ebpfContext { +// newEBPFContext creates new EBPF Context from given core dump image. The +// faultAddresses map, if non-empty, instructs bpf_probe_read_user_with_test_fault +// to return -1 for those addresses; the int value of each entry is incremented +// each time the helper visits the address. 
+func newEBPFContext(pr process.Process, faultAddresses map[uintptr]int) *ebpfContext { pid := pr.PID() ctx := &ebpfContext{ trace: libpf.EbpfTrace{PID: pid}, @@ -78,6 +89,7 @@ func newEBPFContext(pr process.Process) *ebpfContext { maps: make(map[unsafe.Pointer]map[any]unsafe.Pointer), perCPURecord: C.malloc(C.sizeof_PerCPURecord), unwindInfoArray: C.malloc(C.sizeof_UnwindInfo * C.ulong(support.UnwindInfoMaxEntries)), + faultAddresses: faultAddresses, } ebpfContextMap[ctx.PIDandTGID] = ctx return ctx diff --git a/tools/coredump/ebpfhelpers.go b/tools/coredump/ebpfhelpers.go index 0ee5048b9..117fbef0f 100644 --- a/tools/coredump/ebpfhelpers.go +++ b/tools/coredump/ebpfhelpers.go @@ -55,6 +55,31 @@ func __bpf_probe_read_user(id C.u64, buf unsafe.Pointer, sz C.int, ptr unsafe.Po return 0 } +//export __bpf_probe_read_user_with_test_fault +func __bpf_probe_read_user_with_test_fault( + id C.u64, buf unsafe.Pointer, sz C.int, ptr unsafe.Pointer, +) C.long { + ctx := ebpfContextMap[id] + addr := uintptr(ptr) + // Trace every call so coredump test authors can grep the test output to + // pick a candidate address (e.g. the 192-byte read of a PyCodeObject) when + // constructing a fault-injection test case. + log.Debugf("bpf_probe_read_user_with_test_fault: sz=%d ptr=0x%x", int(sz), addr) + if _, ok := ctx.faultAddresses[addr]; ok { + // This log line stays at Info level so it's visible in CI when a + // fault-injection test actually exercises the recovery path. 
+ log.Infof("bpf_probe_read_user_with_test_fault: injecting fault at 0x%x (sz=%d)", + addr, int(sz)) + ctx.faultAddresses[addr]++ + return -1 + } + dst := sliceBuffer(buf, sz) + if _, err := ctx.remoteMemory.ReadAt(dst, int64(addr)); err != nil { + return -1 + } + return 0 +} + // stackDeltaInnerMap is a special map returned to C code to indicate that // we are accessing one of nested maps in the exe_id_to_X_stack_deltas maps var stackDeltaInnerMap = C.malloc(1) diff --git a/tools/coredump/json.go b/tools/coredump/json.go index f0c5fd5cc..0addff0e1 100644 --- a/tools/coredump/json.go +++ b/tools/coredump/json.go @@ -21,6 +21,11 @@ type CoredumpTestCase struct { Skip string `json:"skip,omitempty"` Threads []ThreadInfo `json:"threads"` Modules []ModuleInfo `json:"modules"` + // FaultAddresses is an optional list of user-space addresses (hex strings, + // e.g. "0x7f1234567000") at which the test harness should make + // bpf_probe_read_user_with_test_fault return -1, simulating a BPF read + // failure. Used to exercise recovery paths. + FaultAddresses []string `json:"fault-addresses,omitempty"` } // ModuleInfo stores information about a module that was loaded when the coredump was created. 
diff --git a/tools/coredump/new.go b/tools/coredump/new.go index 3df9499f3..9be0b7b47 100644 --- a/tools/coredump/new.go +++ b/tools/coredump/new.go @@ -169,7 +169,7 @@ func (cmd *newCmd) exec(context.Context, []string) (err error) { testCase := &CoredumpTestCase{} - testCase.Threads, err = ExtractTraces(context.Background(), core, cmd.debugEbpf, nil) + testCase.Threads, err = ExtractTraces(context.Background(), core, cmd.debugEbpf, nil, nil) if err != nil { return fmt.Errorf("failed to extract traces: %w", err) } diff --git a/tools/coredump/rebase.go b/tools/coredump/rebase.go index 8b024e2ef..d88005ea7 100644 --- a/tools/coredump/rebase.go +++ b/tools/coredump/rebase.go @@ -61,7 +61,7 @@ func (cmd *rebaseCmd) exec(context.Context, []string) (err error) { return fmt.Errorf("failed to open coredump: %w", err) } - testCase.Threads, err = ExtractTraces(context.Background(), core, false, nil) + testCase.Threads, err = ExtractTraces(context.Background(), core, false, nil, nil) _ = core.Close() if err != nil { return fmt.Errorf("failed to extract traces: %w", err) diff --git a/tools/coredump/testdata/amd64/python3.14-faulted-codeobject.json b/tools/coredump/testdata/amd64/python3.14-faulted-codeobject.json new file mode 100644 index 000000000..886a316a2 --- /dev/null +++ b/tools/coredump/testdata/amd64/python3.14-faulted-codeobject.json @@ -0,0 +1,81 @@ +{ + "coredump-ref": "a69bc3dea27b197295177ab1645783ddf27e682fafc9102ef7723b05c528285c", + "fault-addresses": [ + "0x7fb421b7cb70", + "0x7fb421b51c30" + ], + "threads": [ + { + "lwp": 28509, + "frames": [ + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in 
/tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in 
/tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "recur_fibo+4 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:7", + "+9 in /tmp/opentelemetry-ebpf-profiler/tools/coredump/testsources/python/fib.py:10", + "?+0x0", + "python3.14+0x5454c6", + "python3.14+0x53ffa0", + "python3.14+0x6b741e", + "python3.14+0x6b3dcf", + "python3.14+0x6b377d", + "python3.14+0x6b3271", + "python3.14+0x6b0552", + "python3.14+0x65ea68", + "libc.so.6+0x29f74", + "libc.so.6+0x2a026", + "python3.14+0x65de70" + ] + } + ], + "modules": [ + { + "ref": "85590dd58edf5445e18bc7193e5ebc01ac5841f1ae187e97705a662e90c6421e", + "local-path": "/usr/lib/x86_64-linux-gnu/libz.so.1.3.1" + }, + { + "ref": "e9a55da498abf2f190abd96115c7b9381d51968729b35e9d351e6a16cc951e5c", + "local-path": "/usr/lib/x86_64-linux-gnu/libm.so.6" + }, + { + "ref": "f5ffb9a6143905c445c980b7430ea94ad800b3c15dc83d5ec7667c6807f595dd", + "local-path": "/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2" + }, + { + "ref": "af6ab77528a4732280b9e158c987893a492ae315929277493fb60f0b4d381d8c", + "local-path": "/usr/bin/python3.14" + }, + { + "ref": "7d771b3a3182ca9193afafa461e099af06ec929b6171bb1c20920c11af3e5850", + "local-path": "/usr/lib/x86_64-linux-gnu/libc.so.6" + }, + { + "ref": "01ed980b9420653bfa242cdddcfed0b59c5dc066c79d95b8637a52d0135a64f1", + "local-path": "/usr/lib/x86_64-linux-gnu/libexpat.so.1.11.1" + } + ] +}