-
Notifications
You must be signed in to change notification settings - Fork 399
Handle processes whose main thread has exited #376
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a03325e
6ebd504
218cdd4
9cb832a
75697c8
8057b43
ba8e383
1efeee6
feba978
408cab3
fefdc8f
c2ea120
195f78b
f9a7e22
ce6940a
34f5656
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,27 @@ | ||
| package libpf // import "go.opentelemetry.io/ebpf-profiler/libpf" | ||
|
|
||
| import ( | ||
| "fmt" | ||
| ) | ||
|
|
||
| // PID represent Unix Process ID (pid_t) | ||
| type PID uint32 | ||
|
|
||
| func (p PID) Hash32() uint32 { | ||
| return uint32(p) | ||
| } | ||
|
|
||
| // PIDTID encodes a process id and a thread id | ||
| type PIDTID uint64 | ||
|
|
||
| func (pt PIDTID) PID() PID { | ||
| return PID(pt >> 32) | ||
| } | ||
|
|
||
| func (pt PIDTID) TID() PID { | ||
| return PID(pt & 0xFFFFFFFF) | ||
| } | ||
|
|
||
| func (pt PIDTID) String() string { | ||
| return fmt.Sprintf("PID: %v TID: %v", pt.PID(), pt.TID()) | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,12 +25,17 @@ import ( | |
| "go.opentelemetry.io/ebpf-profiler/stringutil" | ||
| ) | ||
|
|
||
| // GetMappings returns this error when no mappings can be extracted. | ||
| var ErrNoMappings = errors.New("no mappings") | ||
|
|
||
| // systemProcess provides an implementation of the Process interface for a | ||
| // process that is currently running on this machine. | ||
| type systemProcess struct { | ||
| pid libpf.PID | ||
| tid libpf.PID | ||
|
|
||
| remoteMemory remotememory.RemoteMemory | ||
| mainThreadExit bool | ||
| remoteMemory remotememory.RemoteMemory | ||
|
|
||
| fileToMapping map[string]*Mapping | ||
| } | ||
|
|
@@ -54,9 +59,10 @@ func init() { | |
| } | ||
|
|
||
| // New returns an object with Process interface accessing it | ||
| func New(pid libpf.PID) Process { | ||
| func New(pid, tid libpf.PID) Process { | ||
| return &systemProcess{ | ||
| pid: pid, | ||
| tid: tid, | ||
| remoteMemory: remotememory.NewProcessVirtualMemory(pid), | ||
| } | ||
| } | ||
|
|
@@ -166,9 +172,8 @@ func parseMappings(mapsFile io.Reader) ([]Mapping, uint32, error) { | |
| path = VdsoPathName | ||
| device = 0 | ||
| inode = vdsoInode | ||
| } else if path != "" { | ||
| // Ignore [vsyscall] and similar executable kernel | ||
| // pages we don't care about | ||
| } else { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No semantic change, I just inlined the logic from |
||
| // Ignore mappings that are invalid, non-existent or are special pseudo-files | ||
| continue | ||
| } | ||
| } else { | ||
|
|
@@ -230,20 +235,48 @@ func (sp *systemProcess) GetMappings() ([]Mapping, uint32, error) { | |
| defer mapsFile.Close() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could this original path of checking the primary
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm yes, I can add the check/skip-pid-maps logic but I'll also need to add the zombie check to ensure that we're only going to be reading tid-specific maps if it's absolutely the case that main thread has exited.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added the zombie check for process exit but adding a conditional check for So we can either have |
||
|
|
||
| mappings, numParseErrors, err := parseMappings(mapsFile) | ||
| if err == nil { | ||
| fileToMapping := make(map[string]*Mapping, len(mappings)) | ||
| for idx := range mappings { | ||
| m := &mappings[idx] | ||
| if m.Inode == 0 { | ||
| // Ignore mappings that are invalid, | ||
| // non-existent or are special pseudo-files. | ||
| continue | ||
| } | ||
| fileToMapping[m.Path] = m | ||
| if err != nil { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think here we should also return early if
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't the If |
||
| return mappings, numParseErrors, err | ||
| } | ||
|
|
||
| if len(mappings) == 0 { | ||
| // We could test for main thread exit here by checking for zombie state | ||
| // in /proc/sp.pid/stat but it's simpler to assume that this is the case | ||
| // and try extracting mappings for a different thread. Since we stopped | ||
| // processing /proc at agent startup, it's not possible that the agent | ||
| // will sample a process without mappings | ||
| log.Debugf("PID: %v main thread exit", sp.pid) | ||
| sp.mainThreadExit = true | ||
|
|
||
| if sp.pid == sp.tid { | ||
| return mappings, numParseErrors, ErrNoMappings | ||
| } | ||
|
|
||
| log.Debugf("TID: %v extracting mappings", sp.tid) | ||
| mapsFileAlt, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/maps", sp.pid, sp.tid)) | ||
| // On all errors resulting from trying to get mappings from a different thread, | ||
| // return ErrNoMappings which will keep the PID tracked in processmanager and | ||
| // allow for a future iteration to try extracting mappings from a different thread. | ||
| // This is done to deal with race conditions triggered by thread exits (we do not want | ||
| // the agent to unload process metadata when a thread exits but the process is still | ||
| // alive). | ||
| if err != nil { | ||
| return mappings, numParseErrors, ErrNoMappings | ||
| } | ||
| defer mapsFileAlt.Close() | ||
| mappings, numParseErrors, err = parseMappings(mapsFileAlt) | ||
| if err != nil || len(mappings) == 0 { | ||
| return mappings, numParseErrors, ErrNoMappings | ||
| } | ||
| sp.fileToMapping = fileToMapping | ||
| } | ||
| return mappings, numParseErrors, err | ||
|
|
||
| fileToMapping := make(map[string]*Mapping, len(mappings)) | ||
| for idx := range mappings { | ||
| m := &mappings[idx] | ||
| fileToMapping[m.Path] = m | ||
| } | ||
| sp.fileToMapping = fileToMapping | ||
| return mappings, numParseErrors, nil | ||
| } | ||
|
|
||
| func (sp *systemProcess) GetThreads() ([]ThreadInfo, error) { | ||
|
|
@@ -272,6 +305,13 @@ func (sp *systemProcess) getMappingFile(m *Mapping) string { | |
| if m.IsAnonymous() || m.IsVDSO() { | ||
| return "" | ||
| } | ||
| if sp.mainThreadExit { | ||
| // Neither /proc/sp.pid/map_files nor /proc/sp.pid/task/sp.tid/map_files | ||
| // nor /proc/sp.pid/root exist if main thread has exited, so we use the | ||
| // mapping path directly under the sp.tid root. | ||
| rootPath := fmt.Sprintf("/proc/%v/task/%v/root", sp.pid, sp.tid) | ||
| return path.Join(rootPath, m.Path) | ||
| } | ||
| return fmt.Sprintf("/proc/%v/map_files/%x-%x", sp.pid, m.Vaddr, m.Vaddr+m.Length) | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -614,9 +614,16 @@ func (pm *ProcessManager) SynchronizeProcess(pr process.Process) { | |
| return | ||
| } | ||
|
|
||
| if errors.Is(err, process.ErrNoMappings) { | ||
| // When no mappings can be extracted but the process is still alive, | ||
| // do not trigger a process exit to avoid unloading process metadata. | ||
| // As it's likely that a future iteration can extract mappings from a | ||
| // different thread in the process, notify eBPF to enable further notifications. | ||
| pm.ebpf.RemoveReportedPID(pid) | ||
| return | ||
| } | ||
|
|
||
| // All other errors imply that the process has exited. | ||
| // Clean up, and notify eBPF. | ||
| pm.processPIDExit(pid) | ||
| if os.IsNotExist(err) { | ||
| // Since listing /proc and opening files in there later is inherently racy, | ||
| // we expect to lose the race sometimes and thus expect to hit os.IsNotExist. | ||
|
|
@@ -626,22 +633,7 @@ func (pm *ProcessManager) SynchronizeProcess(pr process.Process) { | |
| // return ESRCH. Handle it as if the process did not exist. | ||
| pm.mappingStats.errProcESRCH.Add(1) | ||
| } | ||
| return | ||
| } | ||
| if len(mappings) == 0 { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These comments are no longer relevant. |
||
| // Valid process without any (executable) mappings. All cases are | ||
| // handled as process exit. Possible causes and reasoning: | ||
| // 1. It is a kernel worker process. The eBPF does not send events from these, | ||
| // but we can see kernel threads here during startup when tracer walks | ||
| // /proc and tries to synchronize all PIDs it sees. | ||
| // The PID should not exist anywhere, but we can still double check and | ||
| // make sure the PID is not tracked. | ||
| // 2. It is a normal process executing, but we just sampled it when the kernel | ||
| // execve() is rebuilding the mappings and nothing is currently mapped. | ||
| // In this case we can handle it as process exit because everything about | ||
| // the process is changing: all mappings, comm, etc. If execve fails, we | ||
| // reaped it early. If execve succeeds, we will get new synchronization | ||
| // request soon, and handle it as a new process event. | ||
|
christos68k marked this conversation as resolved.
|
||
| // Clean up, and notify eBPF. | ||
| pm.processPIDExit(pid) | ||
| return | ||
| } | ||
|
|
@@ -744,6 +736,7 @@ func (pm *ProcessManager) ProcessedUntil(traceCaptureKTime times.KTime) { | |
| continue | ||
| } | ||
|
|
||
| log.Debugf("PID %v deleted", pid) | ||
| delete(pm.pidToProcessInfo, pid) | ||
|
|
||
| for _, instance := range pm.interpreters[pid] { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't switch
Processto acceptlibpf.PIDTIDas the latter is only used with PID events, and I'd rather not couple it here too.