diff --git a/Makefile b/Makefile
index 23b87d4ed..729c62299 100644
--- a/Makefile
+++ b/Makefile
@@ -149,7 +149,7 @@ test-deps:
 		($(MAKE) -C "$(testdata_dir)") || exit ; \
 	)
 
-TEST_INTEGRATION_BINARY_DIRS := tracer processmanager/ebpf support interpreter/golabels/integrationtests
+TEST_INTEGRATION_BINARY_DIRS := tracer processmanager/ebpf kallsyms support interpreter/golabels/integrationtests
 
 pprof-execs: pprof_1_23 pprof_1_24 pprof_1_24_cgo pprof_1_24_cgo_pie pprof_stable pprof_stable_cgo pprof_stable_cgo_pie
 
diff --git a/kallsyms/bpf.go b/kallsyms/bpf.go
new file mode 100644
index 000000000..c34c32438
--- /dev/null
+++ b/kallsyms/bpf.go
@@ -0,0 +1,349 @@
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package kallsyms // import "go.opentelemetry.io/ebpf-profiler/kallsyms"
+
+import (
+	"cmp"
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"slices"
+	"strings"
+	"sync/atomic"
+	"time"
+
+	"github.com/cilium/ebpf"
+	"github.com/elastic/go-perf"
+	"go.opentelemetry.io/ebpf-profiler/internal/log"
+	"go.opentelemetry.io/ebpf-profiler/libpf"
+	"golang.org/x/sys/unix"
+)
+
+// bpfProgPrefix is the prefix the kernel uses for all JIT'd BPF program
+// symbols in /proc/kallsyms and PERF_RECORD_KSYMBOL events.
+const bpfProgPrefix = "bpf_prog_"
+
+// bpfSymbol describes one JIT'd BPF function: its start address, its length
+// in bytes and its symbol name.
+type bpfSymbol struct {
+	address libpf.Address
+	size    uint32
+	name    string
+}
+
+// bpfSymbolTable is a sorted (by address) snapshot of all known BPF program
+// symbols. It is stored atomically so readers never block writers.
+type bpfSymbolTable struct {
+	symbols []bpfSymbol
+}
+
+// lookup returns the name of the symbol containing addr and the offset of addr
+// within it, or ("", 0, false) if none does.
+// A symbol covers [address, address+size), so a symbol with size 0 never matches.
+func (t *bpfSymbolTable) lookup(addr libpf.Address) (string, uint, bool) {
+	// Binary search for the last symbol whose address <= addr.
+	// BinarySearchFunc returns (index of exact match, true) or
+	// (insertion point, false). In both cases the candidate symbol
+	// is at the returned index when found, or at index-1 when not found.
+	idx, found := slices.BinarySearchFunc(t.symbols, addr, func(sym bpfSymbol, a libpf.Address) int {
+		return cmp.Compare(sym.address, a)
+	})
+
+	if !found {
+		// idx is the insertion point; the last symbol with address <= addr
+		// is one position to the left.
+		if idx == 0 {
+			return "", 0, false
+		}
+		idx--
+	}
+
+	sym := &t.symbols[idx]
+	if addr >= sym.address+libpf.Address(sym.size) {
+		return "", 0, false
+	}
+
+	return sym.name, uint(addr - sym.address), true
+}
+
+// bpfSymbolizer is responsible for getting updates from `PERF_RECORD_KSYMBOL`.
+// The symbolizer is not ready to use until startMonitor is called to load the symbols.
+type bpfSymbolizer struct {
+	records chan *perf.KSymbolRecord
+	events  []*perf.Event
+	cancel  context.CancelFunc
+	table   atomic.Pointer[bpfSymbolTable]
+}
+
+// LookupSymbol resolves addr to a BPF program symbol name and offset.
+// Returns ("", 0, false) if no BPF program covers addr.
+func (s *bpfSymbolizer) LookupSymbol(addr libpf.Address) (string, uint, bool) {
+	t := s.table.Load()
+	if t == nil {
+		return "", 0, false
+	}
+
+	return t.lookup(addr)
+}
+
+// loadBPFPrograms enumerates all loaded BPF programs via the bpf syscall and
+// builds a sorted bpfSymbolTable from their JIT symbol addresses and sizes.
+// Only symbols with the "bpf_prog_" prefix are included; trampolines and
+// dispatchers are intentionally excluded because they are not visible at
+// initial scan time and would cause misattribution.
+//
+// An error is returned when the enumeration itself fails. In that case the
+// previously published table is left untouched so the caller can retry
+// without losing the symbols it already knows about.
+func (s *bpfSymbolizer) loadBPFPrograms() error {
+	var symbols []bpfSymbol
+
+	id := ebpf.ProgramID(0)
+	for {
+		var err error
+		id, err = ebpf.ProgramGetNextID(id)
+		if errors.Is(err, os.ErrNotExist) {
+			// No next program: the enumeration is complete.
+			break
+		}
+		if err != nil {
+			// Any other error means the listing is incomplete. Report it
+			// instead of publishing a truncated table.
+			return fmt.Errorf("enumerating BPF programs: %w", err)
+		}
+
+		prog, err := ebpf.NewProgramFromID(id)
+		if err != nil {
+			// Program may have been unloaded between listing and opening.
+			continue
+		}
+
+		info, err := prog.Info()
+		prog.Close()
+		if err != nil {
+			continue
+		}
+
+		addrs, ok := info.JitedKsymAddrs()
+		if !ok || len(addrs) == 0 {
+			continue
+		}
+
+		// Lengths may be unavailable; such symbols keep size 0 and are
+		// therefore never matched by lookup.
+		lens, _ := info.JitedFuncLens()
+
+		// The kernel names BPF JIT symbols "bpf_prog_<tag>_<name>", or just
+		// "bpf_prog_<tag>" when the program has no name.
+		name := bpfProgPrefix + info.Tag
+		if info.Name != "" {
+			name += "_" + info.Name
+		}
+
+		for i, addr := range addrs {
+			sym := bpfSymbol{
+				address: libpf.Address(addr),
+				name:    name,
+			}
+
+			if i < len(lens) {
+				sym.size = lens[i]
+			}
+
+			symbols = append(symbols, sym)
+		}
+	}
+
+	slices.SortFunc(symbols, func(a, b bpfSymbol) int {
+		return cmp.Compare(a.address, b.address)
+	})
+
+	s.table.Store(&bpfSymbolTable{symbols: symbols})
+
+	return nil
+}
+
+// startMonitor starts the update monitoring and loads bpf symbols.
+// Resources acquired before a failure are not released here; the caller
+// must call Close to release them.
+func (s *bpfSymbolizer) startMonitor(ctx context.Context, onlineCPUs []int) error {
+	ctx, s.cancel = context.WithCancel(ctx)
+
+	// Subscribe before the initial scan so that programs loaded while the
+	// scan is running are still reported through the perf events.
+	err := s.subscribe(ctx, onlineCPUs)
+	if err != nil {
+		return err
+	}
+
+	err = s.loadBPFPrograms()
+	if err != nil {
+		return err
+	}
+
+	go s.reloadWorker(ctx)
+
+	return nil
+}
+
+// subscribe subscribes to updates for bpf symbols via `PERF_RECORD_KSYMBOL`.
+func (s *bpfSymbolizer) subscribe(ctx context.Context, onlineCPUs []int) error {
+	// readRetryDelay is how long a reader waits after a failed read before
+	// trying again, so a persistent failure cannot become a busy loop.
+	const readRetryDelay = 100 * time.Millisecond
+
+	attr := new(perf.Attr)
+	perf.Dummy.Configure(attr)
+	attr.Options.KSymbol = true
+	attr.SetWakeupWatermark(1)
+
+	s.records = make(chan *perf.KSymbolRecord)
+
+	for _, cpu := range onlineCPUs {
+		event, err := perf.Open(attr, perf.AllThreads, cpu, nil)
+		if err != nil {
+			return err
+		}
+
+		// Track the event right away so Close releases it even if one of
+		// the following setup steps fails.
+		s.events = append(s.events, event)
+
+		err = event.MapRing()
+		if err != nil {
+			return err
+		}
+
+		err = event.Enable()
+		if err != nil {
+			return err
+		}
+
+		// One reader goroutine per CPU; it runs until ctx is cancelled.
+		go func(event *perf.Event) {
+			for {
+				record, err := event.ReadRecord(ctx)
+				if err != nil {
+					if ctx.Err() != nil {
+						return
+					}
+
+					log.Errorf("Failed to read perf event: %v", err)
+
+					// Back off before retrying, but stop promptly on cancellation.
+					select {
+					case <-time.After(readRetryDelay):
+					case <-ctx.Done():
+						return
+					}
+					continue
+				}
+
+				switch ksymbol := record.(type) {
+				case *perf.LostRecord:
+					// nil as a sentinel value to indicate lost events. Whenever this happens
+					// we trigger a full re-scan of existing bpf programs to prevent data loss.
+					select {
+					case s.records <- nil:
+					case <-ctx.Done():
+					}
+				case *perf.KSymbolRecord:
+					if ksymbol.Type != unix.PERF_RECORD_KSYMBOL_TYPE_BPF {
+						continue
+					}
+
+					select {
+					case s.records <- ksymbol:
+					case <-ctx.Done():
+					}
+				default:
+					log.Debugf("Unexpected perf record type: %T", record)
+				}
+
+				if ctx.Err() != nil {
+					return
+				}
+			}
+		}(event)
+	}
+
+	return nil
+}
+
+// reloadWorker is the goroutine handling the reloads of the bpf symbols.
+func (s *bpfSymbolizer) reloadWorker(ctx context.Context) {
+	// noTimeout never fires; it disables the reload case while no reload is pending.
+	noTimeout := make(<-chan time.Time)
+	nextReload := noTimeout
+	for {
+		select {
+		case <-nextReload:
+			if err := s.loadBPFPrograms(); err == nil {
+				log.Debugf("Kernel symbols reloaded")
+				nextReload = noTimeout
+			} else {
+				log.Warnf("Failed to reload kernel symbols: %v", err)
+				nextReload = time.After(time.Second)
+			}
+		case record := <-s.records:
+			if err := s.handleBPFUpdate(record); err != nil {
+				log.Warnf("Error handling bpf ksymbol update: %v", err)
+				nextReload = time.After(time.Second)
+			}
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+// handleBPFUpdate handles the update record from perf events.
+// A nil record is the sentinel for lost events and is reported as an error so
+// that the caller schedules a full reload.
+func (s *bpfSymbolizer) handleBPFUpdate(record *perf.KSymbolRecord) error {
+	if record == nil {
+		return errors.New("lost events detected")
+	}
+
+	// Only track bpf_prog_* symbols. Trampolines, dispatchers, and other
+	// BPF-tagged symbols are excluded because they are not present at initial
+	// scan time and would cause misattribution.
+	if !strings.HasPrefix(record.Name, bpfProgPrefix) {
+		return nil
+	}
+
+	if record.Flags&unix.PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER != 0 {
+		s.removeBPFSymbol(libpf.Address(record.Addr))
+		return nil
+	}
+
+	s.addBPFSymbol(libpf.Address(record.Addr), record.Name, record.Len)
+
+	return nil
+}
+
+// addBPFSymbol inserts a new BPF program symbol into the table.
+// If a symbol with a different name is already registered at addr it is
+// replaced, so the table never holds two entries for the same address.
+// The table is copied on every change; published snapshots are never mutated.
+func (s *bpfSymbolizer) addBPFSymbol(addr libpf.Address, name string, size uint32) {
+	old := s.table.Load()
+	var oldSymbols []bpfSymbol
+	if old != nil {
+		oldSymbols = old.symbols
+	}
+
+	newSym := bpfSymbol{address: addr, size: size, name: name}
+
+	idx, found := slices.BinarySearchFunc(oldSymbols, addr, func(sym bpfSymbol, a libpf.Address) int {
+		return cmp.Compare(sym.address, a)
+	})
+	if found {
+		// Check for a benign race: symbol already present with the same name.
+		if oldSymbols[idx].name == name {
+			return
+		}
+
+		// The address is known under a different name, so the old entry is
+		// stale. Replace it in a copy instead of inserting a duplicate.
+		replaced := slices.Clone(oldSymbols)
+		replaced[idx] = newSym
+		s.table.Store(&bpfSymbolTable{symbols: replaced})
+		return
+	}
+
+	// Insert the new symbol into the right position to maintain sorting.
+	newSymbols := make([]bpfSymbol, len(oldSymbols)+1)
+	copy(newSymbols, oldSymbols[:idx])
+	newSymbols[idx] = newSym
+	copy(newSymbols[idx+1:], oldSymbols[idx:])
+
+	s.table.Store(&bpfSymbolTable{symbols: newSymbols})
+}
+
+// removeBPFSymbol removes a BPF program symbol from the table by address.
+// It does nothing when no symbol starts exactly at addr.
+func (s *bpfSymbolizer) removeBPFSymbol(addr libpf.Address) {
+	old := s.table.Load()
+	if old == nil {
+		return
+	}
+
+	idx, found := slices.BinarySearchFunc(old.symbols, addr, func(sym bpfSymbol, a libpf.Address) int {
+		return cmp.Compare(sym.address, a)
+	})
+	if !found {
+		return
+	}
+
+	newSymbols := make([]bpfSymbol, len(old.symbols)-1)
+	copy(newSymbols, old.symbols[:idx])
+	copy(newSymbols[idx:], old.symbols[idx+1:])
+
+	s.table.Store(&bpfSymbolTable{symbols: newSymbols})
+}
+
+// Close frees resources associated with bpfSymbolizer.
+func (s *bpfSymbolizer) Close() {
+	// Cancel the context first so reader goroutines and reloadWorker are
+	// asked to stop before the perf events are closed. Close does not wait
+	// for those goroutines to exit.
+	if s.cancel != nil {
+		s.cancel()
+	}
+
+	for _, event := range s.events {
+		if err := event.Disable(); err != nil {
+			log.Errorf("Failed to disable perf event: %v", err)
+		}
+		if err := event.Close(); err != nil {
+			log.Errorf("Failed to close perf event: %v", err)
+		}
+	}
+
+	s.events = nil
+}
diff --git a/kallsyms/bpf_integration_test.go b/kallsyms/bpf_integration_test.go
new file mode 100644
index 000000000..850678227
--- /dev/null
+++ b/kallsyms/bpf_integration_test.go
@@ -0,0 +1,179 @@
+//go:build integration && linux
+
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package kallsyms
+
+import (
+	"runtime"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/cilium/ebpf"
+	"github.com/cilium/ebpf/asm"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"go.opentelemetry.io/ebpf-profiler/libpf"
+	"go.opentelemetry.io/ebpf-profiler/rlimit"
+)
+
+const (
+	eventuallyWaitFor = 10 * time.Second
+	eventuallyTick    = 100 * time.Millisecond
+
+	dynamicProgName     = "otel_dyn_test"
+	preexistingProgName = "otel_pre_test"
+)
+
+// linearCPUs returns []int{0, 1, ..., n-1} for n online CPUs.
+// This assumes contiguous CPU IDs, which is practical for integration tests.
+// The proper parsing of /sys/devices/system/cpu/online lives in tracer/helper.go,
+// but we don't want to export or duplicate it here.
+func linearCPUs() []int {
+	cpus := make([]int, runtime.NumCPU())
+	for i := range cpus {
+		cpus[i] = i
+	}
+	return cpus
+}
+
+// loadSocketFilter loads a minimal BPF socket filter program with the given name.
+// The program simply returns 0. The caller is responsible for closing it.
+func loadSocketFilter(t *testing.T, name string) *ebpf.Program {
+	t.Helper()
+
+	spec := &ebpf.ProgramSpec{
+		Name:    name,
+		Type:    ebpf.SocketFilter,
+		License: "GPL",
+		Instructions: asm.Instructions{
+			asm.Mov.Imm(asm.R0, 0),
+			asm.Return(),
+		},
+	}
+
+	prog, err := ebpf.NewProgram(spec)
+	require.NoError(t, err)
+
+	return prog
+}
+
+// findBPFSymbol searches the bpfSymbolTable for a symbol whose name ends with
+// "_<progName>". Returns the full symbol name and its address, or ("", 0) if
+// the table is not loaded yet or no symbol matches.
+func findBPFSymbol(s *bpfSymbolizer, progName string) (string, libpf.Address) {
+	suffix := "_" + progName
+
+	tbl := s.table.Load()
+	if tbl == nil {
+		return "", 0
+	}
+
+	for _, sym := range tbl.symbols {
+		if strings.HasSuffix(sym.name, suffix) {
+			return sym.name, sym.address
+		}
+	}
+	return "", 0
+}
+
+// assertBPFSymbolFound polls the symbolizer until a BPF symbol matching progName
+// appears, then verifies the full symbolization path (address -> symbol) at the
+// symbol's start address and one byte past it. It returns the full symbol name
+// and its address.
+func assertBPFSymbolFound(t *testing.T, s *Symbolizer, progName string) (string, libpf.Address) {
+	t.Helper()
+
+	var fullName string
+	var progAddr libpf.Address
+	require.Eventually(t, func() bool {
+		fullName, progAddr = findBPFSymbol(s.bpf, progName)
+		return fullName != ""
+	}, eventuallyWaitFor, eventuallyTick,
+		"BPF program with suffix %q not found by symbolizer", "_"+progName)
+
+	t.Logf("Found BPF program %q at address 0x%x", fullName, progAddr)
+
+	funcName, offset, ok := s.LookupBPFSymbol(progAddr)
+	require.True(t, ok, "LookupBPFSymbol failed for address 0x%x", progAddr)
+	assert.Equal(t, fullName, funcName)
+	assert.Equal(t, uint(0), offset)
+
+	funcName, offset, ok = s.LookupBPFSymbol(progAddr + 1)
+	require.True(t, ok, "LookupBPFSymbol failed for address 0x%x", progAddr+1)
+	assert.Equal(t, fullName, funcName)
+	assert.Equal(t, uint(1), offset)
+
+	return fullName, progAddr
+}
+
+// assertBPFSymbolRemoved polls the symbolizer until the BPF symbol matching
+// progName disappears.
+func assertBPFSymbolRemoved(t *testing.T, s *Symbolizer, progName string) { + t.Helper() + + require.Eventually(t, func() bool { + name, _ := findBPFSymbol(s.bpf, progName) + return name == "" + }, eventuallyWaitFor, eventuallyTick, + "BPF program with suffix %q not removed from symbolizer", "_"+progName) + + t.Logf("BPF program with suffix %q successfully removed from symbolizer", "_"+progName) +} + +// TestBPFSymbolizerDynamic verifies that programs loaded after the monitor +// starts are discovered via PERF_RECORD_KSYMBOL events and that unloading +// them removes the symbols. +func TestBPFSymbolizerDynamic(t *testing.T) { + restoreRlimit, err := rlimit.MaximizeMemlock() + require.NoError(t, err) + defer restoreRlimit() + + s, err := NewSymbolizer() + require.NoError(t, err) + + err = s.bpf.startMonitor(t.Context(), linearCPUs()) + require.NoError(t, err) + defer s.bpf.Close() + + // The program hasn't been loaded yet, so the symbolizer must not know about it. + name, _ := findBPFSymbol(s.bpf, dynamicProgName) + require.Empty(t, name, "BPF program %q found before loading", dynamicProgName) + + prog := loadSocketFilter(t, dynamicProgName) + + fullName, _ := assertBPFSymbolFound(t, s, dynamicProgName) + + prog.Close() + assertBPFSymbolRemoved(t, s, dynamicProgName) + + t.Logf("Dynamic test passed: %q added and removed", fullName) +} + +// TestBPFSymbolizerPreexisting verifies that programs loaded before the +// monitor starts are discovered via the initial /proc/kallsyms parse. +func TestBPFSymbolizerPreexisting(t *testing.T) { + restoreRlimit, err := rlimit.MaximizeMemlock() + require.NoError(t, err) + defer restoreRlimit() + + // Load the program before starting the monitor. 
+ prog := loadSocketFilter(t, preexistingProgName) + + s, err := NewSymbolizer() + require.NoError(t, err) + + err = s.bpf.startMonitor(t.Context(), linearCPUs()) + require.NoError(t, err) + defer s.bpf.Close() + + // The program was loaded before the monitor started, so it must be + // discovered from /proc/kallsyms during the initial load. + fullName, _ := assertBPFSymbolFound(t, s, preexistingProgName) + t.Logf("Preexisting program %q found from initial kallsyms load", fullName) + + // Close the program and verify the symbol is removed via perf event. + prog.Close() + assertBPFSymbolRemoved(t, s, preexistingProgName) + t.Logf("Preexisting program %q successfully removed", fullName) +} diff --git a/kallsyms/kallsyms.go b/kallsyms/kallsyms.go index efcc5e3be..c9e46cd65 100644 --- a/kallsyms/kallsyms.go +++ b/kallsyms/kallsyms.go @@ -38,6 +38,9 @@ const Kernel = "vmlinux" // from the kernel kallsyms file. const pointerBits = int(unsafe.Sizeof(libpf.Address(0)) * 8) +// maxAddr is the max address value. +const maxAddr = uint64(1< maxAddr { + return fmt.Errorf("address exceeds pointer size: %x > %x", address, maxAddr) + } if address != 0 { noSymbols = false } @@ -409,13 +417,11 @@ func (s *Symbolizer) updateSymbolsFrom(r io.Reader) error { symbols: syms[0:0], names: names[0:0], } - if moduleName != "bpf" { - oldMod, _ = getModuleByAddress(modules, libpf.Address(address)) - if oldMod != nil && !oldMod.stub && oldMod.Name() == moduleName { - oldMtime = oldMod.mtime - } else { - oldMod = nil - } + oldMod, _ = getModuleByAddress(modules, libpf.Address(address)) + if oldMod != nil && !oldMod.stub && oldMod.Name() == moduleName { + oldMtime = oldMod.mtime + } else { + oldMod = nil } if loadModuleMetadata(&newMod, moduleName, oldMtime) { // Module metadata was updated. Parse this module symbols. 
@@ -458,6 +464,9 @@ func (s *Symbolizer) updateSymbolsFrom(r io.Reader) error { } } } + if err := scanner.Err(); err != nil { + return fmt.Errorf("error scanning /proc/kallsyms: %w", err) + } if mod != nil { mod.finish() } @@ -486,7 +495,6 @@ func (s *Symbolizer) loadKallsyms() error { var nonsyfsModules = libpf.Set[string]{ Kernel: libpf.Void{}, - "bpf": libpf.Void{}, } // loadModules will reload module metadata. @@ -575,12 +583,6 @@ func (s *Symbolizer) reloadWorker(ctx context.Context, kobjectClient *kobject.Cl log.Warnf("Failed to reload kernel modules metadata: %v", err) nextModulesReload = time.After(10 * time.Second) } - case <-s.reloadSymbols: - // Just trigger reloading of symbols with small delay to batch - // potentially multiple module loads. - if nextKallsymsReload == noTimeout { - nextKallsymsReload = time.After(100 * time.Millisecond) - } case <-nextKallsymsReload: if err := s.loadKallsyms(); err == nil { log.Debugf("Kernel symbols reloaded") @@ -615,26 +617,28 @@ func (s *Symbolizer) pollKobjectClient(_ context.Context, kobjectClient *kobject } } -// Reload will trigger asynchronous update of modules and symbols. -func (s *Symbolizer) StartMonitor(ctx context.Context) error { +// Close frees resources associated with the Symbolizer. +func (s *Symbolizer) Close() { + s.bpf.Close() +} + +// StartMonitor starts the update monitoring for kallsyms. +func (s *Symbolizer) StartMonitor(ctx context.Context, onlineCPUs []int) error { kobjectClient, err := kobject.New() if err != nil { return fmt.Errorf("failed to create kobject netlink socket: %v", err) } + err = s.bpf.startMonitor(ctx, onlineCPUs) + if err != nil { + s.bpf.Close() + _ = kobjectClient.Close() + return err + } go s.reloadWorker(ctx, kobjectClient) go s.pollKobjectClient(ctx, kobjectClient) return nil } -// Reload triggers a non-blocking reload and update of Symbolizer -// with the recent information of /proc/kallsyms. 
-func (s *Symbolizer) Reload() { - select { - case s.reloadSymbols <- libpf.Void{}: - default: - } -} - // getModuleByAddress is a helper to find a Module from the sorted 'modules' // slice matching the address 'pc'. func getModuleByAddress(modules []Module, pc libpf.Address) (*Module, error) { @@ -656,7 +660,7 @@ func (s *Symbolizer) GetModuleByAddress(pc libpf.Address) (*Module, error) { return getModuleByAddress(s.modules.Load().([]Module), pc) } -// GetModuleByAddress finds the Module containing the module 'module'. +// GetModuleByName finds the Module with the given name. func (s *Symbolizer) GetModuleByName(module string) (*Module, error) { modules := s.modules.Load().([]Module) for i := range modules { @@ -667,3 +671,12 @@ func (s *Symbolizer) GetModuleByName(module string) (*Module, error) { } return nil, ErrNoModule } + +// LookupBPFSymbol resolves addr to a BPF program symbol name and offset. +// Returns ("", 0, false) if no BPF program covers addr. +func (s *Symbolizer) LookupBPFSymbol(addr libpf.Address) (string, uint, bool) { + if s.bpf == nil { + return "", 0, false + } + return s.bpf.LookupSymbol(addr) +} diff --git a/kallsyms/kallsyms_test.go b/kallsyms/kallsyms_test.go index c69471d50..dd8bed983 100644 --- a/kallsyms/kallsyms_test.go +++ b/kallsyms/kallsyms_test.go @@ -4,12 +4,16 @@ package kallsyms import ( + "cmp" "io" + "slices" "strings" "testing" "go.opentelemetry.io/ebpf-profiler/libpf" + "golang.org/x/sys/unix" + "github.com/elastic/go-perf" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -105,6 +109,156 @@ ffffffffc13fcb20 t init_xfs_fs [xfs]`)) assertSymbol(t, s, 0xffffffffc13cc610+1, "xfs", "perf_trace_xfs_attr_list_class", 1) } +// setBPFSymbols stores the given symbols in the bpfSymbolizer as a sorted +// bpfSymbolTable. This replaces the production loadBPFPrograms for tests. 
+func setBPFSymbols(s *bpfSymbolizer, symbols []bpfSymbol) { + sorted := make([]bpfSymbol, len(symbols)) + copy(sorted, symbols) + slices.SortFunc(sorted, func(a, b bpfSymbol) int { + return cmp.Compare(a.address, b.address) + }) + s.table.Store(&bpfSymbolTable{symbols: sorted}) +} + +// assertBPFSymbol checks that the BPF symbolizer resolves addr to the expected +// function name and offset. +func assertBPFSymbol(t *testing.T, s *Symbolizer, addr libpf.Address, eFuncName string, eOffset uint) { + t.Helper() + funcName, off, ok := s.LookupBPFSymbol(addr) + if assert.True(t, ok, "expected BPF symbol at 0x%x", addr) { + assert.Equal(t, eFuncName, funcName) + assert.Equal(t, eOffset, off) + } +} + +// assertNoBPFSymbol checks that the BPF symbolizer does not resolve addr. +func assertNoBPFSymbol(t *testing.T, s *Symbolizer, addr libpf.Address) { + t.Helper() + _, _, ok := s.LookupBPFSymbol(addr) + assert.False(t, ok, "expected no BPF symbol at 0x%x", addr) +} + +func TestBPFUpdates(t *testing.T) { + s := &Symbolizer{ + bpf: &bpfSymbolizer{}, + } + + bpfSymbols := []bpfSymbol{ + {address: 0xffffffc080f26228, size: 512, name: "bpf_prog_00354c172d366337_sd_devices"}, + {address: 0xffffffc080f26430, size: 512, name: "bpf_prog_772db7720b2728e9_sd_fw_egress"}, + {address: 0xffffffc080f264d8, size: 512, name: "bpf_prog_772db7720b2728e9_sd_fw_ingress"}, + {address: 0xffffffc080f28490, size: 512, name: "bpf_prog_56551fa66be1356a_sd_devices"}, + {address: 0xffffffc080f2867c, size: 512, name: "bpf_prog_772db7720b2728e9_sd_fw_egress"}, + {address: 0xffffffc080f2871c, size: 512, name: "bpf_prog_772db7720b2728e9_sd_fw_ingress"}, + {address: 0xffffffc080f2da64, size: 512, name: "bpf_prog_00354c172d366337_sd_devices"}, + {address: 0xffffffc080f304a0, size: 512, name: "bpf_prog_5be112cdf63b0d8c_sysctl_monitor"}, + {address: 0xffffffc080f3089c, size: 512, name: "bpf_prog_292e0637857c1257_cut_last"}, + {address: 0xffffffc080f3096c, size: 512, name: 
"bpf_prog_a97c143260cd9940_sd_devices"}, + {address: 0xffffffc080f32f4c, size: 512, name: "bpf_prog_79c5319176ee7ce5_sd_devices"}, + {address: 0xffffffc080f331e4, size: 512, name: "bpf_prog_772db7720b2728e9_sd_fw_egress"}, + {address: 0xffffffc080f33288, size: 512, name: "bpf_prog_772db7720b2728e9_sd_fw_ingress"}, + {address: 0xffffffc080f35f1c, size: 512, name: "bpf_prog_461f9f5162fd8042_sd_devices"}, + {address: 0xffffffc080f3629c, size: 512, name: "bpf_prog_b8f4fb5f08605bc5"}, + } + + setBPFSymbols(s.bpf, bpfSymbols) + + // Adding a symbol at the end with a known size of 12288 bytes. This ensures + // that an address 10240 bytes into the symbol is covered even though that + // far exceeds a single page past the symbol start. + const lastSymAddr = libpf.Address(0xffffffc080f38288) + const lastSymSize = uint32(12288) + err := s.bpf.handleBPFUpdate(&perf.KSymbolRecord{ + Addr: uint64(lastSymAddr), + Len: lastSymSize, + Name: "bpf_prog_05cbe5ca7b74dd09_sys_enter", + }) + require.NoError(t, err) + + // exact symbol match + assertBPFSymbol(t, s, lastSymAddr, "bpf_prog_05cbe5ca7b74dd09_sys_enter", 0) + + // 10240 bytes into the last symbol must resolve correctly + assertBPFSymbol(t, s, lastSymAddr+10240, "bpf_prog_05cbe5ca7b74dd09_sys_enter", 10240) + + // address beyond the symbol's end must not resolve + assertNoBPFSymbol(t, s, lastSymAddr+libpf.Address(lastSymSize)) + + // remove the added symbol + err = s.bpf.handleBPFUpdate(&perf.KSymbolRecord{ + Addr: uint64(lastSymAddr), + Name: "bpf_prog_05cbe5ca7b74dd09_sys_enter", + Flags: unix.PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER, + }) + require.NoError(t, err) + + // the address goes poof + assertNoBPFSymbol(t, s, lastSymAddr) + + // add it back + err = s.bpf.handleBPFUpdate(&perf.KSymbolRecord{ + Addr: uint64(lastSymAddr), + Len: lastSymSize, + Name: "bpf_prog_05cbe5ca7b74dd09_sys_enter", + }) + require.NoError(t, err) + + // find a pre-existing symbol by aiming slightly above its start + assertBPFSymbol(t, s, 
0xffffffc080f3089e, "bpf_prog_292e0637857c1257_cut_last", 0x2) + + // remove the pre-existing symbol + err = s.bpf.handleBPFUpdate(&perf.KSymbolRecord{ + Addr: 0xffffffc080f3089c, + Name: "bpf_prog_292e0637857c1257_cut_last", + Flags: unix.PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER, + }) + require.NoError(t, err) + + // the address no longer resolves (previous symbol ends before 0x3089e) + assertNoBPFSymbol(t, s, 0xffffffc080f3089e) + + // put the removed symbol back + err = s.bpf.handleBPFUpdate(&perf.KSymbolRecord{ + Addr: 0xffffffc080f3089c, + Len: 512, + Name: "bpf_prog_292e0637857c1257_cut_last", + }) + require.NoError(t, err) + + // and it's right there where we put it + assertBPFSymbol(t, s, 0xffffffc080f3089e, "bpf_prog_292e0637857c1257_cut_last", 0x2) + + // checking for lost symbols triggering full reload + err = s.bpf.handleBPFUpdate(nil) + assert.NotNil(t, err) + + // trampolines and non-bpf_prog_ symbols are silently ignored + err = s.bpf.handleBPFUpdate(&perf.KSymbolRecord{ + Addr: 0xffffffc080f26226, + Name: "bpf_trampoline_6442536467", + }) + require.NoError(t, err) + assertNoBPFSymbol(t, s, 0xffffffc080f26226) + + // a bpf_prog_ symbol added before existing ones is found correctly + err = s.bpf.handleBPFUpdate(&perf.KSymbolRecord{ + Addr: 0xffffffc080f26000, + Len: 512, + Name: "bpf_prog_earliest", + }) + require.NoError(t, err) + assertBPFSymbol(t, s, 0xffffffc080f26000, "bpf_prog_earliest", 0) + + // removing it works + err = s.bpf.handleBPFUpdate(&perf.KSymbolRecord{ + Addr: 0xffffffc080f26000, + Name: "bpf_prog_earliest", + Flags: unix.PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER, + }) + require.NoError(t, err) + assertNoBPFSymbol(t, s, 0xffffffc080f26000) +} + func BenchmarkSort(b *testing.B) { r := strings.NewReader(`0000000000000000 A __per_cpu_start 0000000000001000 A cpu_debug_store diff --git a/support/ebpf/interpreter_dispatcher.ebpf.c b/support/ebpf/interpreter_dispatcher.ebpf.c index ae9a1c9a7..8bcfb7b25 100644 --- 
a/support/ebpf/interpreter_dispatcher.ebpf.c +++ b/support/ebpf/interpreter_dispatcher.ebpf.c @@ -34,7 +34,7 @@ struct perf_progs_t { __uint(max_entries, NUM_TRACER_PROGS); } perf_progs SEC(".maps"); -// report_events notifies user space about events (GENERIC_PID and RELOAD_KALLSYMS). +// report_events notifies user space about events (GENERIC_PID). // // As a key the CPU number is used and the value represents a perf event file descriptor. // Information transmitted is the event type only. We use 0 as the number of max entries diff --git a/support/ebpf/kallsyms.ebpf.c b/support/ebpf/kallsyms.ebpf.c deleted file mode 100644 index 8e2eee667..000000000 --- a/support/ebpf/kallsyms.ebpf.c +++ /dev/null @@ -1,11 +0,0 @@ -#include "bpfdefs.h" -#include "tracemgmt.h" -#include "types.h" - -// kprobe__kallsyms notifies user space about changes to kallsyms. -SEC("kprobe/kallsysms") -int kprobe__kallsyms(void *ctx) -{ - event_send_trigger(ctx, EVENT_TYPE_RELOAD_KALLSYMS); - return 0; -} diff --git a/support/ebpf/system_config.ebpf.c b/support/ebpf/system_config.ebpf.c index d29d8c662..7d81e8169 100644 --- a/support/ebpf/system_config.ebpf.c +++ b/support/ebpf/system_config.ebpf.c @@ -33,15 +33,15 @@ int read_kernel_memory(UNUSED void *ctx) return 0; } - // Mark request handled - sys->pid = 0; - // Handle the read request - if (bpf_probe_read_kernel(sys->code, sizeof(sys->code), (void *)sys->address)) { - DEBUG_PRINT("Failed to read code from 0x%lx", (unsigned long)sys->address); - return -1; + sys->err = bpf_probe_read_kernel(sys->code, sizeof(sys->code), (void *)sys->address); + if (sys->err) { + DEBUG_PRINT("Failed to read code from 0x%lx: %ld", (unsigned long)sys->address, (long)sys->err); } + // Mark request handled once the helper has finished populating the result. + sys->pid = 0; + return 0; } @@ -65,9 +65,6 @@ int read_task_struct(struct bpf_raw_tracepoint_args *ctx) return 0; } - // Mark request handled - sys->pid = 0; - // Request to read current task. 
Adjust read address, and return // also the address of struct pt_regs in the entry stack. u64 addr = bpf_get_current_task() + sys->address; @@ -78,11 +75,14 @@ int read_task_struct(struct bpf_raw_tracepoint_args *ctx) sys->address = (u64)regs; // Execute the read request. - if (bpf_probe_read_kernel(sys->code, sizeof(sys->code), (void *)addr)) { - DEBUG_PRINT("Failed to read task_struct from 0x%lx", (unsigned long)addr); - return -1; + sys->err = bpf_probe_read_kernel(sys->code, sizeof(sys->code), (void *)addr); + if (sys->err) { + DEBUG_PRINT("Failed to read task_struct from 0x%lx: %ld", (unsigned long)addr, (long)sys->err); } + // Mark request handled once the helper has finished populating the result. + sys->pid = 0; + return 0; } diff --git a/support/ebpf/tracemgmt.h b/support/ebpf/tracemgmt.h index d2420df01..40e1b9c69 100644 --- a/support/ebpf/tracemgmt.h +++ b/support/ebpf/tracemgmt.h @@ -53,8 +53,7 @@ static inline EBPF_INLINE void increment_metric(u32 metricID) } // Send immediate notifications for event triggers to Go. -// Notifications for GENERIC_PID and RELOAD_KALLSYMS will be -// automatically inhibited until HA resets the type. +// Notifications for GENERIC_PID will be automatically inhibited until HA resets the type. 
static inline EBPF_INLINE void event_send_trigger(struct pt_regs *ctx, u32 event_type) { int inhibit_key = event_type; diff --git a/support/ebpf/tracer.ebpf.amd64 b/support/ebpf/tracer.ebpf.amd64 index c745bd234..f402a0916 100644 Binary files a/support/ebpf/tracer.ebpf.amd64 and b/support/ebpf/tracer.ebpf.amd64 differ diff --git a/support/ebpf/tracer.ebpf.arm64 b/support/ebpf/tracer.ebpf.arm64 index a964f079d..9603dd50f 100644 Binary files a/support/ebpf/tracer.ebpf.arm64 and b/support/ebpf/tracer.ebpf.arm64 differ diff --git a/support/ebpf/types.h b/support/ebpf/types.h index 187caa82d..8c8291269 100644 --- a/support/ebpf/types.h +++ b/support/ebpf/types.h @@ -973,6 +973,7 @@ typedef struct OffsetRange { typedef struct SystemAnalysis { u64 address; u32 pid; + s32 err; u8 code[128]; } SystemAnalysis; @@ -983,8 +984,7 @@ typedef struct Event { } Event; // Event types that notifications are sent for through event_send_trigger. -#define EVENT_TYPE_GENERIC_PID 1 -#define EVENT_TYPE_RELOAD_KALLSYMS 2 +#define EVENT_TYPE_GENERIC_PID 1 // PIDPage represents the key of the eBPF map pid_page_to_mapping_info. 
typedef struct PIDPage { diff --git a/support/types.go b/support/types.go index 34a94bde5..85f6d183c 100644 --- a/support/types.go +++ b/support/types.go @@ -54,8 +54,7 @@ const ( ) const ( - EventTypeGenericPID = 0x1 - EventTypeReloadKallsyms = 0x2 + EventTypeGenericPID = 0x1 ) const UnwindInfoMaxEntries = 0x4000 @@ -138,10 +137,10 @@ type StackDeltaPageKey struct { Page uint64 } type SystemAnalysis struct { - Address uint64 - Pid uint32 - Code [128]uint8 - Pad_cgo_0 [4]byte + Address uint64 + Pid uint32 + Err int32 + Code [128]uint8 } type TSDInfo struct { Offset int16 diff --git a/support/types_def.go b/support/types_def.go index 760b263f2..37f7f201d 100644 --- a/support/types_def.go +++ b/support/types_def.go @@ -60,8 +60,7 @@ const ( ) const ( - EventTypeGenericPID = C.EVENT_TYPE_GENERIC_PID - EventTypeReloadKallsyms = C.EVENT_TYPE_RELOAD_KALLSYMS + EventTypeGenericPID = C.EVENT_TYPE_GENERIC_PID ) const UnwindInfoMaxEntries = C.UNWIND_INFO_MAX_ENTRIES diff --git a/tracer/events.go b/tracer/events.go index 9cd8cf6ea..d2e365daa 100644 --- a/tracer/events.go +++ b/tracer/events.go @@ -82,9 +82,6 @@ func (t *Tracer) triggerReportEvent(data []byte) { switch event.Type { case support.EventTypeGenericPID: t.handleGenericPID() - case support.EventTypeReloadKallsyms: - t.enableEvent(support.EventTypeReloadKallsyms) - t.kernelSymbolizer.Reload() } } diff --git a/tracer/helper.go b/tracer/helper.go index 75bfef12a..ca75aec37 100644 --- a/tracer/helper.go +++ b/tracer/helper.go @@ -50,7 +50,7 @@ func hasProbeReadBug(major, minor, patch uint32) bool { } // getOnlineCPUIDs reads online CPUs from /sys/devices/system/cpu/online and reports -// the core IDs as a list of integers. +// the core IDs as a list of integers. You should probably use `onlineCPUsOnce` instead. 
func getOnlineCPUIDs() ([]int, error) { cpuPath := "/sys/devices/system/cpu/online" buf, err := os.ReadFile(cpuPath) diff --git a/tracer/systemconfig.go b/tracer/systemconfig.go index 51781dd6a..cd89050fa 100644 --- a/tracer/systemconfig.go +++ b/tracer/systemconfig.go @@ -10,6 +10,7 @@ import ( "os" "runtime" "strings" + "syscall" "unsafe" "go.opentelemetry.io/ebpf-profiler/kallsyms" @@ -32,6 +33,11 @@ type sysConfigVars struct { stack_ptregs_offset uint32 } +var ( + errSystemAnalysisNotHandled = errors.New("system analysis request was not handled") + errSystemAnalysisFailed = errors.New("system analysis helper failed") +) + // memberByName resolves btf Member from a Struct with given name func memberByName(t *btf.Struct, field string) (*btf.Member, error) { for i, m := range t.Members { @@ -163,10 +169,29 @@ func executeSystemAnalysisBpfCode(progSpec *cebpf.ProgramSpec, maps map[string]* if err != nil { return nil, 0, fmt.Errorf("failed to get analysis data: %v", err) } + if err = validateSystemAnalysisResult(data, address); err != nil { + return nil, 0, err + } return data.Code[:], data.Address, nil } +func validateSystemAnalysisResult(data support.SystemAnalysis, address libpf.SymbolValue) error { + if data.Pid != 0 { + return fmt.Errorf("%w for pid %d at 0x%x", errSystemAnalysisNotHandled, data.Pid, address) + } + + if data.Err != 0 { + if data.Err < 0 { + return fmt.Errorf("%w at 0x%x: %w (helper err=%d)", errSystemAnalysisFailed, address, syscall.Errno(-data.Err), data.Err) + } + + return fmt.Errorf("%w at 0x%x: helper err=%d", errSystemAnalysisFailed, address, data.Err) + } + + return nil +} + // loadKernelCode will request the ebpf code to read the first X bytes from given address. 
func loadKernelCode(coll *cebpf.CollectionSpec, maps map[string]*cebpf.Map, address libpf.SymbolValue, diff --git a/tracer/systemconfig_test.go b/tracer/systemconfig_test.go new file mode 100644 index 000000000..b11a35635 --- /dev/null +++ b/tracer/systemconfig_test.go @@ -0,0 +1,37 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package tracer + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/require" + + "go.opentelemetry.io/ebpf-profiler/libpf" + "go.opentelemetry.io/ebpf-profiler/support" +) + +func TestValidateSystemAnalysisResult(t *testing.T) { + address := libpf.SymbolValue(0x1234) + + t.Run("not handled", func(t *testing.T) { + err := validateSystemAnalysisResult(support.SystemAnalysis{Pid: 77}, address) + require.Error(t, err) + require.ErrorIs(t, err, errSystemAnalysisNotHandled) + require.ErrorContains(t, err, "pid 77") + }) + + t.Run("helper failure", func(t *testing.T) { + err := validateSystemAnalysisResult(support.SystemAnalysis{Err: -14}, address) + require.Error(t, err) + require.True(t, errors.Is(err, errSystemAnalysisFailed)) + require.ErrorContains(t, err, "helper err=-14") + }) + + t.Run("success", func(t *testing.T) { + err := validateSystemAnalysisResult(support.SystemAnalysis{}, address) + require.NoError(t, err) + }) +} diff --git a/tracer/tracer.go b/tracer/tracer.go index 3a9f09258..70e0dc11c 100644 --- a/tracer/tracer.go +++ b/tracer/tracer.go @@ -79,6 +79,9 @@ type Intervals interface { ExecutableUnloadDelay() time.Duration } +// onlineCPUsOnce resolves and caches the list of online CPUs. +var onlineCPUsOnce = sync.OnceValues(getOnlineCPUIDs) + // Tracer provides an interface for loading and initializing the eBPF components as // well as for monitoring the output maps for new traces and count updates. 
type Tracer struct { @@ -305,6 +308,7 @@ func (t *Tracer) Close() { } t.processManager.Close() + t.kernelSymbolizer.Close() t.signalDone() } @@ -492,10 +496,6 @@ func initializeMapsAndPrograms(kmod *kallsyms.Module, cfg *Config) ( } } - if err = loadKallsymsTrigger(coll, ebpfProgs, cfg.BPFVerifierLogLevel); err != nil { - return nil, nil, nil, fmt.Errorf("failed to load kallsym eBPF program: %v", err) - } - if err = removeTemporaryMaps(ebpfMaps); err != nil { return nil, nil, nil, fmt.Errorf("failed to remove temporary maps: %v", err) } @@ -695,27 +695,6 @@ func loadAllMaps(coll *cebpf.CollectionSpec, cfg *Config, return nil } -// loadKallsymsTrigger loads the eBPF program that triggers kallsym updates. -func loadKallsymsTrigger(coll *cebpf.CollectionSpec, - ebpfProgs map[string]*cebpf.Program, bpfVerifierLogLevel uint32) error { - programOptions := cebpf.ProgramOptions{ - LogLevel: cebpf.LogLevel(bpfVerifierLogLevel), - } - - kallsymsTriggerProg := "kprobe__kallsyms" - progSpec, ok := coll.Programs[kallsymsTriggerProg] - if !ok { - return fmt.Errorf("program %s does not exist", kallsymsTriggerProg) - } - - if err := loadProgram(ebpfProgs, nil, 0, progSpec, - programOptions, true); err != nil { - return err - } - - return nil -} - // schedTimesSize calculates the size of the sched_times map based on the // configured off-cpu threshold. // To not lose too many scheduling events but also not oversize sched_times, @@ -904,7 +883,11 @@ func (t *Tracer) symbolizeKernelFrames(addrs []uint64, oldFrames libpf.Frames) l Type: libpf.KernelFrame, AddressOrLineno: libpf.AddressOrLineno(address - 1), } - if kmod, err := t.kernelSymbolizer.GetModuleByAddress(address); err == nil { + if funcName, offset, ok := t.kernelSymbolizer.LookupBPFSymbol(address); ok { + // BPF program: use address relative to symbol start for deduplication. 
+ frame.AddressOrLineno = libpf.AddressOrLineno(offset) + frame.FunctionName = libpf.Intern(funcName) + } else if kmod, err := t.kernelSymbolizer.GetModuleByAddress(address); err == nil { frame.Mapping = kmod.Mapping() frame.AddressOrLineno -= libpf.AddressOrLineno(kmod.Start()) if funcName, _, err := kmod.LookupSymbolByAddress(address); err == nil { @@ -1092,7 +1075,12 @@ func (t *Tracer) loadBpfTrace(raw []byte, cpu int) (*libpf.EbpfTrace, error) { // StartMapMonitors starts goroutines for collecting metrics and monitoring eBPF // maps for tracepoints, new traces, trace count updates and unknown PCs. func (t *Tracer) StartMapMonitors(ctx context.Context, traceOutChan chan<- *libpf.EbpfTrace) error { - if err := t.kernelSymbolizer.StartMonitor(ctx); err != nil { + onlineCPUs, err := onlineCPUsOnce() + if err != nil { + return fmt.Errorf("failed to get online cpus: %w", err) + } + + if err := t.kernelSymbolizer.StartMonitor(ctx, onlineCPUs); err != nil { log.Warnf("Failed to start kallsyms monitor: %v", err) } eventMetricCollector, err := t.startEventMonitor(ctx) @@ -1142,33 +1130,6 @@ func (t *Tracer) StartMapMonitors(ctx context.Context, traceOutChan chan<- *libp return nil } -func (t *Tracer) attachToKallsymsUpdates() error { - prog, ok := t.ebpfProgs["kprobe__kallsyms"] - if !ok { - return fmt.Errorf("kprobe__kallsyms is not available") - } - - kallsymsAttachPoint := "bpf_ksym_add" - kmod, err := t.kernelSymbolizer.GetModuleByName(kallsyms.Kernel) - if err != nil { - return err - } - - if _, err := kmod.LookupSymbol(kallsymsAttachPoint); err != nil { - log.Infof("Monitoring kallsyms is supported only for Linux kernel 5.8 or greater: %s: %v", - kallsymsAttachPoint, err) - return nil - } - - hook, err := link.Kprobe(kallsymsAttachPoint, prog, nil) - if err != nil { - return fmt.Errorf("failed opening kprobe for kallsyms trigger: %s", err) - } - t.hooks[hookPoint{group: "kprobe", name: kallsymsAttachPoint}] = hook - - return nil -} - // terminatePerfEvents 
disables perf events and closes their file descriptor. func terminatePerfEvents(events []*perf.Event) { for _, event := range events { @@ -1196,14 +1157,14 @@ func (t *Tracer) AttachTracer() error { return fmt.Errorf("failed to configure software perf event: %v", err) } - onlineCPUIDs, err := getOnlineCPUIDs() + onlineCPUs, err := onlineCPUsOnce() if err != nil { - return fmt.Errorf("failed to get online CPUs: %v", err) + return fmt.Errorf("failed to get online cpus: %w", err) } events := t.perfEntrypoints.WLock() defer t.perfEntrypoints.WUnlock(&events) - for _, id := range onlineCPUIDs { + for _, id := range onlineCPUs { perfEvent, err := perf.Open(perfAttribute, perf.AllThreads, id, nil) if err != nil { terminatePerfEvents(*events) @@ -1216,11 +1177,6 @@ func (t *Tracer) AttachTracer() error { *events = append(*events, perfEvent) } - if err = t.attachToKallsymsUpdates(); err != nil { - terminatePerfEvents(*events) - return err - } - return nil }