diff --git a/cli_flags.go b/cli_flags.go index 182cd768e..6ace06145 100644 --- a/cli_flags.go +++ b/cli_flags.go @@ -25,7 +25,7 @@ const ( defaultProbabilisticThreshold = tracer.ProbabilisticThresholdMax defaultProbabilisticInterval = 1 * time.Minute defaultArgSendErrorFrames = false - defaultOffCPUThreshold = support.OffCPUThresholdMax + defaultOffCPUThreshold = 0 // This is the X in 2^(n + x) where n is the default hardcoded map size value defaultArgMapScaleFactor = 0 @@ -63,11 +63,10 @@ var ( "If zero, monotonic-realtime clock sync will be performed once, " + "on agent startup, but not periodically." sendErrorFramesHelp = "Send error frames (devfiler only, breaks Kibana)" - offCPUThresholdHelp = fmt.Sprintf("If set to a value between 1 and %d will enable "+ - "off-cpu profiling: Every time an off-cpu entry point is hit, a random number between "+ - "0 and %d is chosen. If the given threshold is greater than this random number, the "+ - "off-cpu trace is collected and reported.", - support.OffCPUThresholdMax-1, support.OffCPUThresholdMax-1) + offCPUThresholdHelp = fmt.Sprintf("The per-mille chance for an off-cpu event being recorded. "+ + "Valid values are in the range [1..%d], and 0 to disable off-cpu profiling. "+ + "Default is %d.", + support.OffCPUThresholdMax, defaultOffCPUThreshold) ) // Package-scope variable, so that conditionally compiled other components can refer diff --git a/internal/controller/config.go b/internal/controller/config.go index 9a6ece498..57f108248 100644 --- a/internal/controller/config.go +++ b/internal/controller/config.go @@ -8,7 +8,9 @@ import ( "time" log "github.com/sirupsen/logrus" + "go.opentelemetry.io/ebpf-profiler/reporter" + "go.opentelemetry.io/ebpf-profiler/support" "go.opentelemetry.io/ebpf-profiler/tracer" ) @@ -89,6 +91,14 @@ func (cfg *Config) Validate() error { ) } + if cfg.OffCPUThreshold > support.OffCPUThresholdMax { + return fmt.Errorf( + "invalid argument for off-cpu-threshold. 
Value "+ + "should be between 1 and %d, or 0 to disable off-cpu profiling", + support.OffCPUThresholdMax, + ) + } + if !cfg.NoKernelVersionCheck { major, minor, patch, err := tracer.GetCurrentKernelVersion() if err != nil { diff --git a/internal/controller/controller.go b/internal/controller/controller.go index d56c604a5..c57ade248 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -11,7 +11,6 @@ import ( "go.opentelemetry.io/ebpf-profiler/host" "go.opentelemetry.io/ebpf-profiler/metrics" "go.opentelemetry.io/ebpf-profiler/reporter" - "go.opentelemetry.io/ebpf-profiler/support" "go.opentelemetry.io/ebpf-profiler/times" "go.opentelemetry.io/ebpf-profiler/tracehandler" "go.opentelemetry.io/ebpf-profiler/tracer" @@ -108,7 +107,7 @@ func (c *Controller) Start(ctx context.Context) error { } log.Info("Attached tracer program") - if c.config.OffCPUThreshold < support.OffCPUThresholdMax { + if c.config.OffCPUThreshold > 0 { if err := trc.StartOffCPUProfiling(); err != nil { return fmt.Errorf("failed to start off-cpu profiling: %v", err) } diff --git a/support/ebpf/off_cpu.ebpf.c b/support/ebpf/off_cpu.ebpf.c index a98f5d67e..3a81eef7c 100644 --- a/support/ebpf/off_cpu.ebpf.c +++ b/support/ebpf/off_cpu.ebpf.c @@ -37,7 +37,7 @@ int tracepoint__sched_switch(void *ctx) return ERR_UNREACHABLE; } - if (bpf_get_prandom_u32() % OFF_CPU_THRESHOLD_MAX > syscfg->off_cpu_threshold) { + if (bpf_get_prandom_u32() % OFF_CPU_THRESHOLD_MAX >= syscfg->off_cpu_threshold) { return 0; } diff --git a/support/ebpf/tracer.ebpf.release.amd64 b/support/ebpf/tracer.ebpf.release.amd64 index 93097d5f2..c9fd60c3b 100644 Binary files a/support/ebpf/tracer.ebpf.release.amd64 and b/support/ebpf/tracer.ebpf.release.amd64 differ diff --git a/support/ebpf/tracer.ebpf.release.arm64 b/support/ebpf/tracer.ebpf.release.arm64 index 5621f4cd7..97ae2093e 100644 Binary files a/support/ebpf/tracer.ebpf.release.arm64 and b/support/ebpf/tracer.ebpf.release.arm64 differ diff --git 
a/tracer/tracer.go b/tracer/tracer.go index 9237fbba6..a5eaa16c5 100644 --- a/tracer/tracer.go +++ b/tracer/tracer.go @@ -490,7 +490,7 @@ func initializeMapsAndPrograms(kernelSymbols *libpf.SymbolMap, cfg *Config) ( return nil, nil, fmt.Errorf("failed to load perf eBPF programs: %v", err) } - if cfg.OffCPUThreshold < support.OffCPUThresholdMax { + if cfg.OffCPUThreshold > 0 { if err = loadKProbeUnwinders(coll, ebpfProgs, ebpfMaps["kprobe_progs"], tailCallProgs, cfg.BPFVerifierLogLevel, ebpfMaps["perf_progs"].FD()); err != nil { return nil, nil, fmt.Errorf("failed to load kprobe eBPF programs: %v", err) @@ -554,7 +554,7 @@ func loadAllMaps(coll *cebpf.CollectionSpec, cfg *Config, // On modern systems /proc/sys/kernel/pid_max defaults to 4194304. // Try to fit this PID space scaled down with cfg.OffCPUThreshold into // this map. - adaption["sched_times"] = (4194304 / support.OffCPUThresholdMax) * cfg.OffCPUThreshold + adaption["sched_times"] = (4194304 * cfg.OffCPUThreshold) / support.OffCPUThresholdMax for i := support.StackDeltaBucketSmallest; i <= support.StackDeltaBucketLargest; i++ { mapName := fmt.Sprintf("exe_id_to_%d_stack_deltas", i) @@ -562,15 +562,14 @@ func loadAllMaps(coll *cebpf.CollectionSpec, cfg *Config, } for mapName, mapSpec := range coll.Maps { + if mapName == "sched_times" && cfg.OffCPUThreshold == 0 { + // Off CPU Profiling is disabled. So do not load this map. + continue + } if newSize, ok := adaption[mapName]; ok { log.Debugf("Size of eBPF map %s: %v", mapName, newSize) mapSpec.MaxEntries = newSize } - if mapName == "sched_times" && - cfg.OffCPUThreshold >= support.OffCPUThresholdMax { - // Off CPU Profiling is not enabled. So do not load this map. - continue - } ebpfMap, err := cebpf.NewMap(mapSpec) if err != nil { return fmt.Errorf("failed to load %s: %v", mapName, err)