|
| 1 | +// Copyright 2024 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +package crashmonitor |
| 6 | + |
| 7 | +// This file defines a monitor that reports arbitrary Go runtime |
| 8 | +// crashes to telemetry. |
| 9 | + |
| 10 | +import ( |
| 11 | + "bytes" |
| 12 | + "fmt" |
| 13 | + "io" |
| 14 | + "log" |
| 15 | + "os" |
| 16 | + "os/exec" |
| 17 | + "reflect" |
| 18 | + "runtime/debug" |
| 19 | + "strconv" |
| 20 | + "strings" |
| 21 | + |
| 22 | + "golang.org/x/telemetry/internal/counter" |
| 23 | +) |
| 24 | + |
| 25 | +// Supported reports whether the runtime supports [runtime.SetCrashOutput]. |
| 26 | +// |
| 27 | +// TODO(adonovan): eliminate once go1.23+ is assured. |
| 28 | +func Supported() bool { return setCrashOutput != nil } |
| 29 | + |
| 30 | +var setCrashOutput func(*os.File) error // = runtime.SetCrashOutput on go1.23+ |
| 31 | + |
| 32 | +// Start starts the monitor process, which performs automated |
| 33 | +// reporting of unexpected crashes via Go telemetry. Call this |
| 34 | +// function once immediately after [counter.Open]() within the main |
| 35 | +// function of your application, before argument parsing. |
| 36 | +// |
| 37 | +// This function re-executes the current executable as a child |
| 38 | +// process, in a special mode. In that mode, the call to Start will |
| 39 | +// never return. |
| 40 | +// |
| 41 | +// The application should avoid doing expensive work in init functions |
| 42 | +// as they will be executed twice. Run with GODEBUG=inittrace=1 to |
| 43 | +// display the running time of each package initializer. |
| 44 | +// |
| 45 | +// Start uses the [debug.SetCrashOutput] mechanism, which is a |
| 46 | +// process-wide resource. Do not make other calls to that function |
| 47 | +// within your application. Start is a no-op unless the program is |
| 48 | +// built with go1.23+. |
| 49 | +func Start() { |
| 50 | + if !Supported() { |
| 51 | + return |
| 52 | + } |
| 53 | + |
| 54 | + const crashmonitorVar = "X_TELEMETRY_CRASHMONITOR" |
| 55 | + if os.Getenv(crashmonitorVar) != "" { |
| 56 | + // This process is the crashmonitor (child). |
| 57 | + log.SetFlags(0) |
| 58 | + log.SetPrefix("crashmonitor: ") |
| 59 | + |
| 60 | + // Wait for parent process's dying gasp. |
| 61 | + // If the parent dies for any reason this read will return. |
| 62 | + data, err := io.ReadAll(os.Stdin) |
| 63 | + if err != nil { |
| 64 | + log.Fatalf("failed to read from input pipe: %v", err) |
| 65 | + } |
| 66 | + |
| 67 | + // If the only line is the sentinel, it wasn't a crash. |
| 68 | + if bytes.Count(data, []byte("\n")) < 2 { |
| 69 | + os.Exit(0) // parent exited without crash report |
| 70 | + } |
| 71 | + |
| 72 | + log.Printf("parent reported crash:\n%s", data) |
| 73 | + |
| 74 | + // Parse the stack out of the crash report |
| 75 | + // and record a telemetry count for it. |
| 76 | + name, err := telemetryCounterName(data) |
| 77 | + if err != nil { |
| 78 | + // Keep count of how often this happens |
| 79 | + // so that we can investigate if necessary. |
| 80 | + incrementCounter("crash/malformed") |
| 81 | + |
| 82 | + // Something went wrong. |
| 83 | + // Save the crash securely in the file system. |
| 84 | + f, err := os.CreateTemp(os.TempDir(), "*.crash") |
| 85 | + if err != nil { |
| 86 | + log.Fatal(err) |
| 87 | + } |
| 88 | + if _, err := f.Write(data); err != nil { |
| 89 | + log.Fatal(err) |
| 90 | + } |
| 91 | + if err := f.Close(); err != nil { |
| 92 | + log.Fatal(err) |
| 93 | + } |
| 94 | + log.Printf("failed to report crash to telemetry: %v", err) |
| 95 | + log.Fatalf("crash report saved at %s", f.Name()) |
| 96 | + } |
| 97 | + |
| 98 | + incrementCounter(name) |
| 99 | + |
| 100 | + log.Fatalf("telemetry crash recorded") |
| 101 | + } |
| 102 | + |
| 103 | + // This process is the application (parent). |
| 104 | + // Fork+exec the crashmonitor (child). |
| 105 | + exe, err := os.Executable() |
| 106 | + if err != nil { |
| 107 | + log.Fatal(err) |
| 108 | + } |
| 109 | + cmd := exec.Command(exe, "** crashmonitor **") // this unused arg is just for ps(1) |
| 110 | + cmd.Env = append(os.Environ(), crashmonitorVar+"=1") |
| 111 | + cmd.Stderr = os.Stderr |
| 112 | + cmd.Stdout = os.Stderr |
| 113 | + pipe, err := cmd.StdinPipe() |
| 114 | + if err != nil { |
| 115 | + log.Fatalf("StdinPipe: %v", err) |
| 116 | + } |
| 117 | + |
| 118 | + writeSentinel(pipe) |
| 119 | + // Ensure that we get pc=0x%x values in the traceback. |
| 120 | + debug.SetTraceback("system") |
| 121 | + setCrashOutput(pipe.(*os.File)) // (this conversion is safe) |
| 122 | + |
| 123 | + if err := cmd.Start(); err != nil { |
| 124 | + log.Fatalf("can't start crash monitor: %v", err) |
| 125 | + } |
| 126 | + |
| 127 | + // Now return and run the application proper... |
| 128 | +} |
| 129 | + |
| 130 | +// (stubbed by test) |
| 131 | +var incrementCounter = func(name string) { counter.New(name).Inc() } |
| 132 | + |
| 133 | +// The sentinel function returns its address. The difference between |
| 134 | +// this value as observed by calls in two different processes of the |
| 135 | +// same executable tells us the relative offset of their text segments. |
| 136 | +// |
| 137 | +// It would be nice if SetCrashOutput took care of this as it's fiddly |
| 138 | +// and likely to confuse every user at first. |
| 139 | +func sentinel() uint64 { |
| 140 | + return uint64(reflect.ValueOf(sentinel).Pointer()) |
| 141 | +} |
| 142 | + |
| 143 | +func writeSentinel(out io.Writer) { |
| 144 | + fmt.Fprintf(out, "sentinel %x\n", sentinel()) |
| 145 | +} |
| 146 | + |
| 147 | +// telemetryCounterName parses a crash report produced by the Go |
| 148 | +// runtime, extracts the stack of the first runnable goroutine, |
| 149 | +// converts each line into telemetry form ("symbol:relative-line"), |
| 150 | +// and returns this as the name of a counter. |
| 151 | +func telemetryCounterName(crash []byte) (string, error) { |
| 152 | + pcs, err := parseStackPCs(string(crash)) |
| 153 | + if err != nil { |
| 154 | + return "", err |
| 155 | + } |
| 156 | + |
| 157 | + // Limit the number of frames we request. |
| 158 | + pcs = pcs[:min(len(pcs), 16)] |
| 159 | + |
| 160 | + if len(pcs) == 0 { |
| 161 | + // This can occur if all goroutines are idle, as when |
| 162 | + // caught in a deadlock, or killed by an async signal |
| 163 | + // while blocked. |
| 164 | + // |
| 165 | + // TODO(adonovan): consider how to report such |
| 166 | + // situations. Reporting a goroutine in [sleep] or |
| 167 | + // [select] state could be quite confusing without |
| 168 | + // further information about the nature of the crash, |
| 169 | + // as the problem is not local to the code location. |
| 170 | + // |
| 171 | + // For now, we keep count of this situation so that we |
| 172 | + // can access whether it needs a more involved solution. |
| 173 | + return "crash/no-running-goroutine", nil |
| 174 | + } |
| 175 | + |
| 176 | + // This string appears at the start of all |
| 177 | + // crashmonitor-generated counter names. |
| 178 | + // |
| 179 | + // It is tempting to expose this as a parameter of Start, but |
| 180 | + // it is not without risk. What value should most programs |
| 181 | + // provide? There's no point giving the name of the executable |
| 182 | + // as this is already recorded by telemetry. What if the |
| 183 | + // application runs in multiple modes? Then it might be useful |
| 184 | + // to record the mode. The problem is that an application with |
| 185 | + // multiple modes probably doesn't know its mode by line 1 of |
| 186 | + // main.main: it might require flag or argument parsing, or |
| 187 | + // even validation of an environment variable, and we really |
| 188 | + // want to steer users aware from any logic before Start. The |
| 189 | + // flags and arguments will be wrong in the child process, and |
| 190 | + // every extra conditional branch creates a risk that the |
| 191 | + // recursively executed child program will behave not like the |
| 192 | + // monitor but like the application. If the child process |
| 193 | + // exits before calling Start, then the parent application |
| 194 | + // will not have a monitor, and its crash reports will be |
| 195 | + // discarded (written in to a pipe that is never read). |
| 196 | + // |
| 197 | + // So for now, we use this constant string. |
| 198 | + const prefix = "crash/crash" |
| 199 | + return counter.EncodeStack(pcs, prefix), nil |
| 200 | +} |
| 201 | + |
| 202 | +// parseStackPCs parses the parent process's program counters for the |
| 203 | +// first running goroutine out of a GOTRACEBACK=system traceback, |
| 204 | +// adjusting them so that they are valid for the child process's text |
| 205 | +// segment. |
| 206 | +// |
| 207 | +// This function returns only program counter values, ensuring that |
| 208 | +// there is no possibility of strings from the crash report (which may |
| 209 | +// contain PII) leaking into the telemetry system. |
| 210 | +func parseStackPCs(crash string) ([]uintptr, error) { |
| 211 | + // getPC parses the PC out of a line of the form: |
| 212 | + // \tFILE:LINE +0xRELPC sp=... fp=... pc=... |
| 213 | + getPC := func(line string) (uint64, error) { |
| 214 | + _, pcstr, ok := strings.Cut(line, " pc=") // e.g. pc=0x%x |
| 215 | + if !ok { |
| 216 | + return 0, fmt.Errorf("no pc= for stack frame: %s", line) |
| 217 | + } |
| 218 | + return strconv.ParseUint(pcstr, 0, 64) // 0 => allow 0x prefix |
| 219 | + } |
| 220 | + |
| 221 | + var ( |
| 222 | + pcs []uintptr |
| 223 | + parentSentinel uint64 |
| 224 | + childSentinel = sentinel() |
| 225 | + on = false // are we in the first running goroutine? |
| 226 | + lines = strings.Split(crash, "\n") |
| 227 | + ) |
| 228 | + for i := 0; i < len(lines); i++ { |
| 229 | + line := lines[i] |
| 230 | + |
| 231 | + // Read sentinel value. |
| 232 | + if parentSentinel == 0 && strings.HasPrefix(line, "sentinel ") { |
| 233 | + _, err := fmt.Sscanf(line, "sentinel %x", &parentSentinel) |
| 234 | + if err != nil { |
| 235 | + return nil, fmt.Errorf("can't read sentinel line") |
| 236 | + } |
| 237 | + continue |
| 238 | + } |
| 239 | + |
| 240 | + // Search for "goroutine GID [STATUS]" |
| 241 | + if !on { |
| 242 | + if strings.HasPrefix(line, "goroutine ") && |
| 243 | + strings.Contains(line, " [running]:") { |
| 244 | + on = true |
| 245 | + |
| 246 | + if parentSentinel == 0 { |
| 247 | + return nil, fmt.Errorf("no sentinel value in crash report") |
| 248 | + } |
| 249 | + } |
| 250 | + continue |
| 251 | + } |
| 252 | + |
| 253 | + // A blank line marks end of a goroutine stack. |
| 254 | + if line == "" { |
| 255 | + break |
| 256 | + } |
| 257 | + |
| 258 | + // Skip the final "created by SYMBOL in goroutine GID" part. |
| 259 | + if strings.HasPrefix(line, "created by ") { |
| 260 | + break |
| 261 | + } |
| 262 | + |
| 263 | + // Expect a pair of lines: |
| 264 | + // SYMBOL(ARGS) |
| 265 | + // \tFILE:LINE +0xRELPC sp=0x%x fp=0x%x pc=0x%x |
| 266 | + // Note: SYMBOL may contain parens "pkg.(*T).method" |
| 267 | + // The RELPC is sometimes missing. |
| 268 | + |
| 269 | + // Skip the symbol(args) line. |
| 270 | + i++ |
| 271 | + if i == len(lines) { |
| 272 | + break |
| 273 | + } |
| 274 | + line = lines[i] |
| 275 | + |
| 276 | + // Parse the PC, and correct for the parent and child's |
| 277 | + // different mappings of the text section. |
| 278 | + pc, err := getPC(line) |
| 279 | + if err != nil { |
| 280 | + // Inlined frame, perhaps; skip it. |
| 281 | + continue |
| 282 | + } |
| 283 | + pcs = append(pcs, uintptr(pc-parentSentinel+childSentinel)) |
| 284 | + } |
| 285 | + return pcs, nil |
| 286 | +} |
| 287 | + |
| 288 | +func min(x, y int) int { |
| 289 | + if x < y { |
| 290 | + return x |
| 291 | + } else { |
| 292 | + return y |
| 293 | + } |
| 294 | +} |
0 commit comments