From 716aac7f19e84fcb1ffd7ba5d95b8301537ab999 Mon Sep 17 00:00:00 2001 From: Ron Federman <73110295+RonFed@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:55:01 +0200 Subject: [PATCH] Track un-active instrumentation for better reporting (#2016) This PR changes when the instrumentation manager tracks an instrumentation. Once an instrumentation is initialized, we should start tracking it even if it fails to load. The reason is, that we need to call the reporter once the process exits (so it can report/clean it) - however, if we only track the PID once the instrumentation is loaded we won't be able to invoke the reporter properly once a failed-to-load instrumentation needs to be cleaned. Each PID we track is marked whether the instrumentation is active (loaded successfully) or not - by the inst field being nil or not. For un-active PIDs we won't apply config updates. In addition, in case we get an exec event on a non-active PID, we try to instrument it again. This is helpful in cases of chain-loading where the first executable is written in a language we can't instrument, while the second is valid for instrumentation. --- instrumentation/manager.go | 46 +++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/instrumentation/manager.go b/instrumentation/manager.go index 94f037603..6da5792ee 100644 --- a/instrumentation/manager.go +++ b/instrumentation/manager.go @@ -24,6 +24,9 @@ var ( type ConfigUpdate[configGroup ConfigGroup] map[configGroup]Config type instrumentationDetails[processGroup ProcessGroup, configGroup ConfigGroup] struct { + // we want to track the instrumentation even if it failed to load, to be able to report the error + // and clean up the reporter resources once the process exits. + // hence, this might be nil if the instrumentation failed to load. inst Instrumentation pg processGroup cg configGroup @@ -75,11 +78,11 @@ type manager[processGroup ProcessGroup, configGroup ConfigGroup] struct { factories map[OtelDistribution]Factory logger logr.Logger - // all the active instrumentations by pid, + // all the created instrumentations by pid, // this map is not concurrent safe, so it should be accessed only from the main event loop detailsByPid map[int]*instrumentationDetails[processGroup, configGroup] - // active instrumentations by workload, and aggregated by pid + // instrumentations by workload, and aggregated by pid // this map is not concurrent safe, so it should be accessed only from the main event loop detailsByWorkload map[configGroup]map[int]*instrumentationDetails[processGroup, configGroup] @@ -142,6 +145,9 @@ func (m *manager[ProcessGroup, ConfigGroup]) runEventLoop(ctx context.Context) { case <-ctx.Done(): m.logger.Info("stopping eBPF instrumentation manager") for pid, details := range m.detailsByPid { + if details.inst == nil { + continue + } err := details.inst.Close(ctx) if err != nil { m.logger.Error(err, "failed to close instrumentation", "pid", pid) @@ -201,12 +207,14 @@ func (m *manager[ProcessGroup, ConfigGroup]) cleanInstrumentation(ctx context.Co m.logger.Info("cleaning instrumentation resources", "pid", pid, "process group details", details.pg) - err := details.inst.Close(ctx) - if err != nil { - m.logger.Error(err, "failed to close instrumentation") + if details.inst != nil { + err := details.inst.Close(ctx) + if err != nil { + m.logger.Error(err, "failed to close instrumentation") + } } - err = m.handler.Reporter.OnExit(ctx, pid, details.pg) + err := m.handler.Reporter.OnExit(ctx, pid, details.pg) if err != nil { m.logger.Error(err, "failed to report instrumentation exit") } @@ -215,7 +223,7 @@ func (m *manager[ProcessGroup, ConfigGroup]) cleanInstrumentation(ctx context.Co } func (m *manager[ProcessGroup, ConfigGroup]) handleProcessExecEvent(ctx context.Context, e detector.ProcessEvent) error { - if _, found := m.detailsByPid[e.PID]; found { + if details, found := m.detailsByPid[e.PID]; found && details.inst != nil { // this can happen if we have multiple exec events for the same pid (chain loading) // TODO: better handle this? // this can be done by first closing the existing instrumentation, @@ -266,21 +274,24 @@ func (m *manager[ProcessGroup, ConfigGroup]) handleProcessExecEvent(ctx context. return err } - err = inst.Load(ctx) - // call the reporter regardless of the load result - as we want to report the load status - reporterErr := m.handler.Reporter.OnLoad(ctx, e.PID, err, pg) - if err != nil { + loadErr := inst.Load(ctx) + + reporterErr := m.handler.Reporter.OnLoad(ctx, e.PID, loadErr, pg) + if reporterErr != nil { + m.logger.Error(reporterErr, "failed to report instrumentation load", "loaded", loadErr == nil, "pid", e.PID, "process group details", pg) + } + if loadErr != nil { + // we need to track the instrumentation even if the load failed. + // consider a reporter which writes a persistent record for a failed/successful load + // we need to notify the reporter once that PID exits to clean up the resources - hence we track it. + // saving the inst as nil marking the instrumentation failed to load, and is not valid to run/configure/close. + m.startTrackInstrumentation(e.PID, nil, pg, configGroup) m.logger.Error(err, "failed to load instrumentation", "language", otelDisto.Language, "sdk", otelDisto.OtelSdk) // TODO: should we return here the load error? or the instance write error? or both? return err } - if reporterErr != nil { - m.logger.Error(reporterErr, "failed to report instrumentation load") - } - m.startTrackInstrumentation(e.PID, inst, pg, configGroup) - m.logger.Info("instrumentation loaded", "pid", e.PID, "process group details", pg) go func() { @@ -337,6 +348,9 @@ func (m *manager[ProcessGroup, ConfigGroup]) applyInstrumentationConfigurationFo } for _, instDetails := range configGroupInstrumentations { + if instDetails.inst == nil { + continue + } m.logger.Info("applying configuration to instrumentation", "process group details", instDetails.pg, "configGroup", configGroup) applyErr := instDetails.inst.ApplyConfig(ctx, config) err = errors.Join(err, applyErr)