Skip to content

Commit 0a1d30d

Browse files
committed
crashmonitor: a monitor for unexpected crashes
This change defines a crash monitor that increments a telemetry counter for the stack any time the Go runtime crashes. It relies on the runtime/debug.SetCrashOutput feature of go1.23 (#42888); if built with an earlier version of Go, the monitor is a no-op. This feature causes the Go runtime to write the crash to an arbitrary file instead of stderr. In this case, the file is a pipe to a subprocess that executes the application executable in monitor mode. When the pipe is closed, the monitor parses the program counters out of the backtrace, then uses a the usual logic to convert it to the name of a counter, which it then increments. The CL includes two tests, an internal test that parses a crash from the child process's stderr, and an integration test that uses crashmonitor.Start and SetCrashOutput, if available. Updates golang/go#42888 Change-Id: Ie40f9403fa99a9e0bab3b4edc9430be5e18150d7 Reviewed-on: https://go-review.googlesource.com/c/telemetry/+/559503 Auto-Submit: Alan Donovan <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Robert Findley <[email protected]>
1 parent 1cb064e commit 0a1d30d

File tree

5 files changed

+522
-44
lines changed

5 files changed

+522
-44
lines changed

crashmonitor/crash_go123.go

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build go1.23
6+
// +build go1.23
7+
8+
package crashmonitor
9+
10+
import "runtime/debug"
11+
12+
func init() {
13+
setCrashOutput = debug.SetCrashOutput
14+
}

crashmonitor/monitor.go

+294
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package crashmonitor
6+
7+
// This file defines a monitor that reports arbitrary Go runtime
8+
// crashes to telemetry.
9+
10+
import (
11+
"bytes"
12+
"fmt"
13+
"io"
14+
"log"
15+
"os"
16+
"os/exec"
17+
"reflect"
18+
"runtime/debug"
19+
"strconv"
20+
"strings"
21+
22+
"golang.org/x/telemetry/internal/counter"
23+
)
24+
25+
// Supported reports whether the runtime supports [runtime.SetCrashOutput].
26+
//
27+
// TODO(adonovan): eliminate once go1.23+ is assured.
28+
func Supported() bool { return setCrashOutput != nil }
29+
30+
var setCrashOutput func(*os.File) error // = runtime.SetCrashOutput on go1.23+
31+
32+
// Start starts the monitor process, which performs automated
33+
// reporting of unexpected crashes via Go telemetry. Call this
34+
// function once immediately after [counter.Open]() within the main
35+
// function of your application, before argument parsing.
36+
//
37+
// This function re-executes the current executable as a child
38+
// process, in a special mode. In that mode, the call to Start will
39+
// never return.
40+
//
41+
// The application should avoid doing expensive work in init functions
42+
// as they will be executed twice. Run with GODEBUG=inittrace=1 to
43+
// display the running time of each package initializer.
44+
//
45+
// Start uses the [debug.SetCrashOutput] mechanism, which is a
46+
// process-wide resource. Do not make other calls to that function
47+
// within your application. Start is a no-op unless the program is
48+
// built with go1.23+.
49+
func Start() {
50+
if !Supported() {
51+
return
52+
}
53+
54+
const crashmonitorVar = "X_TELEMETRY_CRASHMONITOR"
55+
if os.Getenv(crashmonitorVar) != "" {
56+
// This process is the crashmonitor (child).
57+
log.SetFlags(0)
58+
log.SetPrefix("crashmonitor: ")
59+
60+
// Wait for parent process's dying gasp.
61+
// If the parent dies for any reason this read will return.
62+
data, err := io.ReadAll(os.Stdin)
63+
if err != nil {
64+
log.Fatalf("failed to read from input pipe: %v", err)
65+
}
66+
67+
// If the only line is the sentinel, it wasn't a crash.
68+
if bytes.Count(data, []byte("\n")) < 2 {
69+
os.Exit(0) // parent exited without crash report
70+
}
71+
72+
log.Printf("parent reported crash:\n%s", data)
73+
74+
// Parse the stack out of the crash report
75+
// and record a telemetry count for it.
76+
name, err := telemetryCounterName(data)
77+
if err != nil {
78+
// Keep count of how often this happens
79+
// so that we can investigate if necessary.
80+
incrementCounter("crash/malformed")
81+
82+
// Something went wrong.
83+
// Save the crash securely in the file system.
84+
f, err := os.CreateTemp(os.TempDir(), "*.crash")
85+
if err != nil {
86+
log.Fatal(err)
87+
}
88+
if _, err := f.Write(data); err != nil {
89+
log.Fatal(err)
90+
}
91+
if err := f.Close(); err != nil {
92+
log.Fatal(err)
93+
}
94+
log.Printf("failed to report crash to telemetry: %v", err)
95+
log.Fatalf("crash report saved at %s", f.Name())
96+
}
97+
98+
incrementCounter(name)
99+
100+
log.Fatalf("telemetry crash recorded")
101+
}
102+
103+
// This process is the application (parent).
104+
// Fork+exec the crashmonitor (child).
105+
exe, err := os.Executable()
106+
if err != nil {
107+
log.Fatal(err)
108+
}
109+
cmd := exec.Command(exe, "** crashmonitor **") // this unused arg is just for ps(1)
110+
cmd.Env = append(os.Environ(), crashmonitorVar+"=1")
111+
cmd.Stderr = os.Stderr
112+
cmd.Stdout = os.Stderr
113+
pipe, err := cmd.StdinPipe()
114+
if err != nil {
115+
log.Fatalf("StdinPipe: %v", err)
116+
}
117+
118+
writeSentinel(pipe)
119+
// Ensure that we get pc=0x%x values in the traceback.
120+
debug.SetTraceback("system")
121+
setCrashOutput(pipe.(*os.File)) // (this conversion is safe)
122+
123+
if err := cmd.Start(); err != nil {
124+
log.Fatalf("can't start crash monitor: %v", err)
125+
}
126+
127+
// Now return and run the application proper...
128+
}
129+
130+
// (stubbed by test)
131+
var incrementCounter = func(name string) { counter.New(name).Inc() }
132+
133+
// The sentinel function returns its address. The difference between
134+
// this value as observed by calls in two different processes of the
135+
// same executable tells us the relative offset of their text segments.
136+
//
137+
// It would be nice if SetCrashOutput took care of this as it's fiddly
138+
// and likely to confuse every user at first.
139+
func sentinel() uint64 {
140+
return uint64(reflect.ValueOf(sentinel).Pointer())
141+
}
142+
143+
func writeSentinel(out io.Writer) {
144+
fmt.Fprintf(out, "sentinel %x\n", sentinel())
145+
}
146+
147+
// telemetryCounterName parses a crash report produced by the Go
148+
// runtime, extracts the stack of the first runnable goroutine,
149+
// converts each line into telemetry form ("symbol:relative-line"),
150+
// and returns this as the name of a counter.
151+
func telemetryCounterName(crash []byte) (string, error) {
152+
pcs, err := parseStackPCs(string(crash))
153+
if err != nil {
154+
return "", err
155+
}
156+
157+
// Limit the number of frames we request.
158+
pcs = pcs[:min(len(pcs), 16)]
159+
160+
if len(pcs) == 0 {
161+
// This can occur if all goroutines are idle, as when
162+
// caught in a deadlock, or killed by an async signal
163+
// while blocked.
164+
//
165+
// TODO(adonovan): consider how to report such
166+
// situations. Reporting a goroutine in [sleep] or
167+
// [select] state could be quite confusing without
168+
// further information about the nature of the crash,
169+
// as the problem is not local to the code location.
170+
//
171+
// For now, we keep count of this situation so that we
172+
// can access whether it needs a more involved solution.
173+
return "crash/no-running-goroutine", nil
174+
}
175+
176+
// This string appears at the start of all
177+
// crashmonitor-generated counter names.
178+
//
179+
// It is tempting to expose this as a parameter of Start, but
180+
// it is not without risk. What value should most programs
181+
// provide? There's no point giving the name of the executable
182+
// as this is already recorded by telemetry. What if the
183+
// application runs in multiple modes? Then it might be useful
184+
// to record the mode. The problem is that an application with
185+
// multiple modes probably doesn't know its mode by line 1 of
186+
// main.main: it might require flag or argument parsing, or
187+
// even validation of an environment variable, and we really
188+
// want to steer users aware from any logic before Start. The
189+
// flags and arguments will be wrong in the child process, and
190+
// every extra conditional branch creates a risk that the
191+
// recursively executed child program will behave not like the
192+
// monitor but like the application. If the child process
193+
// exits before calling Start, then the parent application
194+
// will not have a monitor, and its crash reports will be
195+
// discarded (written in to a pipe that is never read).
196+
//
197+
// So for now, we use this constant string.
198+
const prefix = "crash/crash"
199+
return counter.EncodeStack(pcs, prefix), nil
200+
}
201+
202+
// parseStackPCs parses the parent process's program counters for the
203+
// first running goroutine out of a GOTRACEBACK=system traceback,
204+
// adjusting them so that they are valid for the child process's text
205+
// segment.
206+
//
207+
// This function returns only program counter values, ensuring that
208+
// there is no possibility of strings from the crash report (which may
209+
// contain PII) leaking into the telemetry system.
210+
func parseStackPCs(crash string) ([]uintptr, error) {
211+
// getPC parses the PC out of a line of the form:
212+
// \tFILE:LINE +0xRELPC sp=... fp=... pc=...
213+
getPC := func(line string) (uint64, error) {
214+
_, pcstr, ok := strings.Cut(line, " pc=") // e.g. pc=0x%x
215+
if !ok {
216+
return 0, fmt.Errorf("no pc= for stack frame: %s", line)
217+
}
218+
return strconv.ParseUint(pcstr, 0, 64) // 0 => allow 0x prefix
219+
}
220+
221+
var (
222+
pcs []uintptr
223+
parentSentinel uint64
224+
childSentinel = sentinel()
225+
on = false // are we in the first running goroutine?
226+
lines = strings.Split(crash, "\n")
227+
)
228+
for i := 0; i < len(lines); i++ {
229+
line := lines[i]
230+
231+
// Read sentinel value.
232+
if parentSentinel == 0 && strings.HasPrefix(line, "sentinel ") {
233+
_, err := fmt.Sscanf(line, "sentinel %x", &parentSentinel)
234+
if err != nil {
235+
return nil, fmt.Errorf("can't read sentinel line")
236+
}
237+
continue
238+
}
239+
240+
// Search for "goroutine GID [STATUS]"
241+
if !on {
242+
if strings.HasPrefix(line, "goroutine ") &&
243+
strings.Contains(line, " [running]:") {
244+
on = true
245+
246+
if parentSentinel == 0 {
247+
return nil, fmt.Errorf("no sentinel value in crash report")
248+
}
249+
}
250+
continue
251+
}
252+
253+
// A blank line marks end of a goroutine stack.
254+
if line == "" {
255+
break
256+
}
257+
258+
// Skip the final "created by SYMBOL in goroutine GID" part.
259+
if strings.HasPrefix(line, "created by ") {
260+
break
261+
}
262+
263+
// Expect a pair of lines:
264+
// SYMBOL(ARGS)
265+
// \tFILE:LINE +0xRELPC sp=0x%x fp=0x%x pc=0x%x
266+
// Note: SYMBOL may contain parens "pkg.(*T).method"
267+
// The RELPC is sometimes missing.
268+
269+
// Skip the symbol(args) line.
270+
i++
271+
if i == len(lines) {
272+
break
273+
}
274+
line = lines[i]
275+
276+
// Parse the PC, and correct for the parent and child's
277+
// different mappings of the text section.
278+
pc, err := getPC(line)
279+
if err != nil {
280+
// Inlined frame, perhaps; skip it.
281+
continue
282+
}
283+
pcs = append(pcs, uintptr(pc-parentSentinel+childSentinel))
284+
}
285+
return pcs, nil
286+
}
287+
288+
func min(x, y int) int {
289+
if x < y {
290+
return x
291+
} else {
292+
return y
293+
}
294+
}

0 commit comments

Comments
 (0)