This repository has been archived by the owner on Nov 5, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 151
/
external.go
585 lines (517 loc) · 16.4 KB
/
external.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
// Copyright 2017-2019 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Package external implements an external probe type for cloudprober.
External probe type executes an external process for actual probing. These probes
can have two modes: "once" and "server". In "once" mode, the external process is
started for each probe run cycle, while in "server" mode, external process is
started only if it's not running already and Cloudprober communicates with it
over stdin/stdout for each probe cycle.
TODO(manugarg): Add a way to test this program. Write another program that
implements the probe server protocol and use that for testing.
*/
package external
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"strings"
"sync"
"time"
"github.com/golang/protobuf/proto"
"github.com/google/cloudprober/logger"
"github.com/google/cloudprober/metrics"
"github.com/google/cloudprober/metrics/payload"
configpb "github.com/google/cloudprober/probes/external/proto"
serverpb "github.com/google/cloudprober/probes/external/proto"
"github.com/google/cloudprober/probes/external/serverutils"
"github.com/google/cloudprober/probes/options"
"github.com/google/cloudprober/targets/endpoint"
"github.com/google/cloudprober/validators"
)
var (
// TimeBetweenRequests is the time interval between probe requests for
// multiple targets. In server mode, probe requests for multiple targets are
// sent to the same external probe process. Sleeping between requests provides
// some time buffer for the probe process to dequeue the incoming requests and
// avoids filling up the communication pipe.
//
// Note that this value impacts the effective timeout for a target as timeout
// is applied for all the targets in aggregate. For example, 100th target in
// the targets list will have the effective timeout of (timeout - 1ms).
// TODO(manugarg): Make sure that the last target in the list has an impact of
// less than 1% on its timeout.
TimeBetweenRequests = 10 * time.Microsecond
)
type result struct {
total, success int64
latency metrics.Value
validationFailure *metrics.Map
payloadMetrics *metrics.EventMetrics
}
// Probe holds aggregate information about all probe runs, per-target.
type Probe struct {
name string
mode string
cmdName string
cmdArgs []string
opts *options.Options
c *configpb.ProbeConf
l *logger.Logger
// book-keeping params
labelKeys map[string]bool // Labels for substitution
requestID int32
cmdRunning bool
cmdStdin io.Writer
cmdStdout io.ReadCloser
cmdStderr io.ReadCloser
replyChan chan *serverpb.ProbeReply
targets []endpoint.Endpoint
results map[string]*result // probe results keyed by targets
dataChan chan *metrics.EventMetrics
// default payload metrics that we clone from to build per-target payload
// metrics.
payloadParser *payload.Parser
}
// Init initializes the probe with the given params.
func (p *Probe) Init(name string, opts *options.Options) error {
c, ok := opts.ProbeConf.(*configpb.ProbeConf)
if !ok {
return fmt.Errorf("not external probe config")
}
p.name = name
p.opts = opts
if p.l = opts.Logger; p.l == nil {
p.l = &logger.Logger{}
}
p.c = c
p.replyChan = make(chan *serverpb.ProbeReply)
cmdParts := strings.Split(p.c.GetCommand(), " ")
p.cmdName = cmdParts[0]
p.cmdArgs = cmdParts[1:len(cmdParts)]
// Figure out labels we are interested in
p.labelKeys = make(map[string]bool)
validLabels := []string{"@target@", "@address@", "@probe@"}
for _, l := range validLabels {
for _, opt := range p.c.GetOptions() {
if strings.Contains(opt.GetValue(), l) {
p.labelKeys[l] = true
}
}
for _, arg := range p.cmdArgs {
if strings.Contains(arg, l) {
p.labelKeys[l] = true
}
}
}
switch p.c.GetMode() {
case configpb.ProbeConf_ONCE:
p.mode = "once"
case configpb.ProbeConf_SERVER:
p.mode = "server"
default:
return fmt.Errorf("invalid mode: %s", p.c.GetMode())
}
p.results = make(map[string]*result)
if !p.c.GetOutputAsMetrics() {
return nil
}
defaultKind := metrics.CUMULATIVE
if p.c.GetMode() == configpb.ProbeConf_ONCE {
defaultKind = metrics.GAUGE
}
var err error
p.payloadParser, err = payload.NewParser(p.c.GetOutputMetricsOptions(), "external", p.name, metrics.Kind(defaultKind), p.l)
if err != nil {
return fmt.Errorf("error initializing payload metrics: %v", err)
}
return nil
}
// substituteLabels replaces occurrences of @label@ with the values from
// labels. It returns the substituted string and a bool indicating if there
// was a @label@ that did not exist in the labels map.
func substituteLabels(in string, labels map[string]string) (string, bool) {
if len(labels) == 0 {
return in, !strings.Contains(in, "@")
}
delimiter := "@"
output := ""
words := strings.Split(in, delimiter)
count := len(words)
foundAll := true
for j, kwd := range words {
// Even number of words => just copy out.
if j%2 == 0 {
output += kwd
continue
}
// Special case: If there are an odd number of '@' (unbalanced), the last
// odd index doesn't actually have a closing '@', so we just append it as it
// is.
if j == count-1 {
output += delimiter
output += kwd
continue
}
// Special case: "@@" => "@"
if kwd == "" {
output += delimiter
continue
}
// Finally, the labels.
replace, ok := labels[kwd]
if ok {
output += replace
continue
}
// Nothing - put the token back in.
foundAll = false
output += delimiter
output += kwd
output += delimiter
}
return output, foundAll
}
func (p *Probe) startCmdIfNotRunning() error {
// Start external probe command if it's not running already. Note that here we
// are trusting the cmdRunning to be set correctly. It can be false for 3 reasons:
// 1) This is the first call and the process has actually never been started.
// 2) cmd.Start() started the process but still returned an error.
// 3) cmd.Wait() returned incorrectly, while the process was still running.
//
// 2 or 3 should never happen as per design, but managing processes can be tricky.
// Documenting here to help with debugging if we run into an issue.
if p.cmdRunning {
return nil
}
p.l.Infof("Starting external command: %s %s", p.cmdName, strings.Join(p.cmdArgs, " "))
cmd := exec.Command(p.cmdName, p.cmdArgs...)
var err error
if p.cmdStdin, err = cmd.StdinPipe(); err != nil {
return err
}
if p.cmdStdout, err = cmd.StdoutPipe(); err != nil {
return err
}
if p.cmdStderr, err = cmd.StderrPipe(); err != nil {
return err
}
go func() {
scanner := bufio.NewScanner(p.cmdStderr)
for scanner.Scan() {
p.l.Warningf("Stderr of %s: %s", cmd.Path, scanner.Text())
}
}()
if err = cmd.Start(); err != nil {
p.l.Errorf("error while starting the cmd: %s %s. Err: %v", cmd.Path, cmd.Args, err)
return fmt.Errorf("error while starting the cmd: %s %s. Err: %v", cmd.Path, cmd.Args, err)
}
doneChan := make(chan struct{})
// This goroutine waits for the process to terminate and sets cmdRunning to
// false when that happens.
go func() {
err := cmd.Wait()
close(doneChan)
p.cmdRunning = false
if err != nil {
if exitErr, ok := err.(*exec.ExitError); ok {
p.l.Errorf("external probe process died with the status: %s. Stderr: %s", exitErr.Error(), string(exitErr.Stderr))
}
}
}()
go p.readProbeReplies(doneChan)
p.cmdRunning = true
return nil
}
func (p *Probe) readProbeReplies(done chan struct{}) error {
bufReader := bufio.NewReader(p.cmdStdout)
// Start a background goroutine to read probe replies from the probe server
// process's stdout and put them on the probe's replyChan. Note that replyChan
// is a one element channel. Idea is that we won't need buffering other than
// the one provided by Unix pipes.
for {
select {
case <-done:
return nil
default:
}
rep, err := serverutils.ReadProbeReply(bufReader)
if err != nil {
// Return if external probe process pipe has closed. We get:
// io.EOF: when other process has closed the pipe.
// os.ErrClosed: when we have closed the pipe (through cmd.Wait()).
// *os.PathError: deferred close of the pipe.
_, isPathError := err.(*os.PathError)
if err == os.ErrClosed || err == io.EOF || isPathError {
p.l.Errorf("External probe process pipe is closed. Err: %s", err.Error())
return err
}
p.l.Errorf("Error reading probe reply: %s", err.Error())
continue
}
p.replyChan <- rep
}
}
func (p *Probe) defaultMetrics(target string, result *result) *metrics.EventMetrics {
em := metrics.NewEventMetrics(time.Now()).
AddMetric("success", metrics.NewInt(result.success)).
AddMetric("total", metrics.NewInt(result.total)).
AddMetric("latency", result.latency).
AddLabel("ptype", "external").
AddLabel("probe", p.name).
AddLabel("dst", target)
for _, al := range p.opts.AdditionalLabels {
em.AddLabel(al.KeyValueForTarget(target))
}
if p.opts.Validators != nil {
em.AddMetric("validation_failure", result.validationFailure)
}
return em
}
func (p *Probe) labels(target string) map[string]string {
labels := make(map[string]string)
if p.labelKeys["@probe@"] {
labels["probe"] = p.name
}
if p.labelKeys["@target@"] {
labels["target"] = target
}
if p.labelKeys["@address@"] {
addr, err := p.opts.Targets.Resolve(target, p.opts.IPVersion)
if err != nil {
p.l.Warningf("Targets.Resolve(%v, %v) failed: %v ", target, p.opts.IPVersion, err)
} else if !addr.IsUnspecified() {
labels["address"] = addr.String()
}
}
return labels
}
func (p *Probe) sendRequest(requestID int32, target string) error {
req := &serverpb.ProbeRequest{
RequestId: proto.Int32(requestID),
TimeLimit: proto.Int32(int32(p.opts.Timeout / time.Millisecond)),
Options: []*serverpb.ProbeRequest_Option{},
}
for _, opt := range p.c.GetOptions() {
value, found := substituteLabels(opt.GetValue(), p.labels(target))
if !found {
p.l.Warningf("Missing substitution in option %q", value)
}
req.Options = append(req.Options, &serverpb.ProbeRequest_Option{
Name: opt.Name,
Value: proto.String(value),
})
}
p.l.Debugf("Sending a probe request %v to the external probe server for target %v", requestID, target)
return serverutils.WriteMessage(req, p.cmdStdin)
}
type requestInfo struct {
target string
timestamp time.Time
}
// probeStatus captures the single probe status. It's only used by runProbe
// functions to pass a probe's status to processProbeResult method.
type probeStatus struct {
target string
success bool
latency time.Duration
payload string
}
func (p *Probe) processProbeResult(ps *probeStatus, result *result) {
if ps.success && p.opts.Validators != nil {
failedValidations := validators.RunValidators(p.opts.Validators, nil, []byte(ps.payload), result.validationFailure, p.l)
// If any validation failed, log and set success to false.
if len(failedValidations) > 0 {
p.l.Debug("Target:", ps.target, " failed validations: ", strings.Join(failedValidations, ","), ".")
ps.success = false
}
}
if ps.success {
result.success++
result.latency.AddFloat64(ps.latency.Seconds() / p.opts.LatencyUnit.Seconds())
}
em := p.defaultMetrics(ps.target, result)
p.opts.LogMetrics(em)
p.dataChan <- em
// If probe is configured to use the external process output (or reply payload
// in case of server probe) as metrics.
if p.c.GetOutputAsMetrics() {
result.payloadMetrics = p.payloadParser.PayloadMetrics(result.payloadMetrics, ps.payload, ps.target)
p.opts.LogMetrics(result.payloadMetrics)
p.dataChan <- result.payloadMetrics
}
}
func (p *Probe) runServerProbe(ctx context.Context) {
requests := make(map[int32]requestInfo)
var requestsMu sync.RWMutex
doneChan := make(chan struct{})
if err := p.startCmdIfNotRunning(); err != nil {
p.l.Error(err.Error())
return
}
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
// Read probe replies until we have no outstanding requests or context has
// run out.
for {
_, ok := <-doneChan
if !ok {
// It is safe to access requests without lock here as it won't be accessed
// by the send loop after doneChan is closed.
p.l.Debugf("Number of outstanding requests: %d", len(requests))
if len(requests) == 0 {
return
}
}
select {
case <-ctx.Done():
p.l.Error(ctx.Err().Error())
return
case rep := <-p.replyChan:
requestsMu.Lock()
reqInfo, ok := requests[rep.GetRequestId()]
if ok {
delete(requests, rep.GetRequestId())
}
requestsMu.Unlock()
if !ok {
// Not our reply, could be from the last timed out probe.
p.l.Warningf("Got a reply that doesn't match any outstading request: Request id from reply: %v. Ignoring.", rep.GetRequestId())
continue
}
success := true
if rep.GetErrorMessage() != "" {
p.l.Errorf("Probe for target %v failed with error message: %s", reqInfo.target, rep.GetErrorMessage())
success = false
}
p.processProbeResult(&probeStatus{
target: reqInfo.target,
success: success,
latency: time.Since(reqInfo.timestamp),
payload: rep.GetPayload(),
}, p.results[reqInfo.target])
}
}
}()
// Send probe requests
for _, target := range p.targets {
p.requestID++
p.results[target.Name].total++
requestsMu.Lock()
requests[p.requestID] = requestInfo{
target: target.Name,
timestamp: time.Now(),
}
requestsMu.Unlock()
p.sendRequest(p.requestID, target.Name)
time.Sleep(TimeBetweenRequests)
}
// Send signal to receiver loop that we are done sending request.
close(doneChan)
// Wait for receiver goroutine to exit.
wg.Wait()
}
// runCommand encapsulates command executor in a variable so that we can
// override it for testing.
var runCommand = func(ctx context.Context, cmd string, args []string) ([]byte, error) {
return exec.CommandContext(ctx, cmd, args...).Output()
}
func (p *Probe) runOnceProbe(ctx context.Context) {
var wg sync.WaitGroup
for _, target := range p.targets {
wg.Add(1)
go func(target endpoint.Endpoint, result *result) {
defer wg.Done()
args := make([]string, len(p.cmdArgs))
for i, arg := range p.cmdArgs {
res, found := substituteLabels(arg, p.labels(target.Name))
if !found {
p.l.Warningf("Substitution not found in %q", arg)
}
args[i] = res
}
p.l.Infof("Running external command: %s %s", p.cmdName, strings.Join(args, " "))
result.total++
startTime := time.Now()
b, err := runCommand(ctx, p.cmdName, args)
success := true
if err != nil {
success = false
if exitErr, ok := err.(*exec.ExitError); ok {
p.l.Errorf("external probe process died with the status: %s. Stderr: %s", exitErr.Error(), exitErr.Stderr)
} else {
p.l.Errorf("Error executing the external program. Err: %v", err)
}
}
p.processProbeResult(&probeStatus{
target: target.Name,
success: success,
latency: time.Since(startTime),
payload: string(b),
}, result)
}(target, p.results[target.Name])
}
wg.Wait()
}
func (p *Probe) updateTargets() {
p.targets = p.opts.Targets.ListEndpoints()
for _, target := range p.targets {
if _, ok := p.results[target.Name]; ok {
continue
}
var latencyValue metrics.Value
if p.opts.LatencyDist != nil {
latencyValue = p.opts.LatencyDist.Clone()
} else {
latencyValue = metrics.NewFloat(0)
}
p.results[target.Name] = &result{
latency: latencyValue,
validationFailure: validators.ValidationFailureMap(p.opts.Validators),
}
for _, al := range p.opts.AdditionalLabels {
al.UpdateForTarget(target.Name, target.Labels)
}
}
}
func (p *Probe) runProbe(ctx context.Context) {
ctxTimeout, cancelFunc := context.WithTimeout(ctx, p.opts.Timeout)
defer cancelFunc()
p.updateTargets()
if p.mode == "server" {
p.runServerProbe(ctxTimeout)
} else {
p.runOnceProbe(ctxTimeout)
}
}
// Start starts and runs the probe indefinitely.
func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) {
p.dataChan = dataChan
ticker := time.NewTicker(p.opts.Interval)
defer ticker.Stop()
for range ticker.C {
// Don't run another probe if context is canceled already.
select {
case <-ctx.Done():
return
default:
}
p.runProbe(ctx)
}
}