Skip to content

Commit 1c9c63e

Browse files
andrewvcmergify-bot
authored andcommitted
[Heartbeat] Defer monitor / ICMP errors to monitor runtime / ES (#29413)
This PR generally improves the error behavior of all monitors, and some specific ICMP related errors as well. These two items are combined in one PR because the general theme here is improving the ICMP error experience, and improving ICMP required improving all monitors. Fixes #29346 and incremental progress toward #29692 General monitor improvements Generally speaking, per #29692 we are trying to send monitor output to ES wherever possible. With this PR we now send any monitor initialization errors (such as a lack of ICMP kernel capabilities) during monitor creation to ES. We do this by allowing the monitor to initialize and run on schedule, even though we know it will always send the same error message. This lets users more easily debug issues in Kibana. ICMP Specific Improvement This PR also Removes broken a IP capability check that caused heartbeat to be unable to start. We now just rely on return codes from attempts to actually send packets. This is the more specific fix for #29346 . I was not able to exactly reproduce the exact customer reported issue, where the user somehow disabled ipv6 in a way that the ICMP loop that I can't exactly reproduce. I tried disabling ipv6 fully with sudo sysctl net.ipv6.conf.all.disable_ipv6=1 but that didn't yield the error in #29346 The logic is now simplified, there's no truly reliable way to know if you can send an ipv6 (or ipv4) ping before you send it (settings can change at any time! network cards can disappear!), so we just let the error codes happen as the check is executed. This is also generally a better UX in that the errors will now be visible in the Uptime app, not just the logs. It should be noted that the ipv4 and ipv6 boolean options only are documented to affect how DNS lookups happen. With this change the behavior matches the docs. Note that ICMP is a bit weird in that there's a single ICMP loop in heartbeat, and all monitors are really just interacting with that. Removal of .synthetics This also ignores the .synthetics folder which has been inconvenient for some time for devs, in that it dirties the git path (cherry picked from commit 616db13)
1 parent 5409d19 commit 1c9c63e

File tree

12 files changed

+119
-68
lines changed

12 files changed

+119
-68
lines changed

CHANGELOG.next.asciidoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
5050
*Heartbeat*
5151

5252
- Fix race condition in http monitors using `mode:all` that can cause crashes. {pull}29697[pull]
53+
- Fix broken ICMP availability check that prevented heartbeat from starting in rare cases. {pull}29413[pull]
5354

5455
*Metricbeat*
5556

@@ -82,6 +83,8 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
8283

8384
*Heartbeat*
8485

86+
- More errors are now visible in ES with new logic failing monitors later to ease debugging. {pull}29413[pull]
87+
8588

8689
*Metricbeat*
8790

heartbeat/monitors/active/icmp/icmp.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,6 @@ func (jf *jobFactory) checkConfig() error {
9191
}
9292

9393
func (jf *jobFactory) makePlugin() (plugin2 plugin.Plugin, err error) {
94-
if err := jf.loop.checkNetworkMode(jf.ipVersion); err != nil {
95-
return plugin.Plugin{}, err
96-
}
97-
9894
pingFactory := jf.pingIPFactory(&jf.config)
9995

10096
var j []jobs.Job

heartbeat/monitors/active/icmp/loop.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ import (
2323
)
2424

2525
type ICMPLoop interface {
26-
checkNetworkMode(mode string) error
2726
ping(
2827
addr *net.IPAddr,
2928
timeout time.Duration,

heartbeat/monitors/active/icmp/stdloop.go

Lines changed: 9 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ package icmp
2020
import (
2121
"bytes"
2222
"encoding/binary"
23-
"errors"
2423
"fmt"
2524
"math/rand"
2625
"net"
@@ -159,29 +158,6 @@ func newICMPLoop() (*stdICMPLoop, error) {
159158
return l, nil
160159
}
161160

162-
func (l *stdICMPLoop) checkNetworkMode(mode string) error {
163-
ip4, ip6 := false, false
164-
switch mode {
165-
case "ip4":
166-
ip4 = true
167-
case "ip6":
168-
ip6 = true
169-
case "ip":
170-
ip4, ip6 = true, true
171-
default:
172-
return fmt.Errorf("'%v' is not supported", mode)
173-
}
174-
175-
if ip4 && l.conn4 == nil {
176-
return errors.New("failed to initiate IPv4 support. Check log details for permission configuration")
177-
}
178-
if ip6 && l.conn6 == nil {
179-
return errors.New("failed to initiate IPv6 support. Check log details for permission configuration")
180-
}
181-
182-
return nil
183-
}
184-
185161
func (l *stdICMPLoop) runICMPRecv(conn *icmp.PacketConn, proto int) {
186162
for {
187163
bytes := make([]byte, 512)
@@ -251,6 +227,14 @@ func (l *stdICMPLoop) ping(
251227
timeout time.Duration,
252228
interval time.Duration,
253229
) (time.Duration, int, error) {
230+
isIPv6 := addr.IP.To4() == nil
231+
if isIPv6 && l.conn6 == nil {
232+
return -1, -1, fmt.Errorf("cannot ping IPv6 address '%s', no IPv6 connection available", addr)
233+
}
234+
if !isIPv6 && l.conn4 == nil {
235+
return -1, -1, fmt.Errorf("cannot ping IPv4 address '%s', no IPv4 connection available", addr)
236+
}
237+
254238
var err error
255239
toTimer := time.NewTimer(timeout)
256240
defer toTimer.Stop()
@@ -379,7 +363,7 @@ func (l *stdICMPLoop) sendEchoRequest(addr *net.IPAddr) (*requestContext, error)
379363

380364
_, err := conn.WriteTo(encoded, addr)
381365
if err != nil {
382-
return nil, err
366+
return nil, fmt.Errorf("could not write to conn: %w", err)
383367
}
384368

385369
ctx.ts = ts

heartbeat/monitors/factory_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ func TestPreProcessors(t *testing.T) {
149149
}
150150

151151
func TestDuplicateMonitorIDs(t *testing.T) {
152-
serverMonConf := mockPluginConf(t, "custom", "@every 1ms", "http://example.net")
152+
serverMonConf := mockPluginConf(t, "custom", "custom", "@every 1ms", "http://example.net")
153153
badConf := mockBadPluginConf(t, "custom", "@every 1ms")
154154
reg, built, closed := mockPluginsReg()
155155
pipelineConnector := &MockPipelineConnector{}
@@ -190,8 +190,9 @@ func TestDuplicateMonitorIDs(t *testing.T) {
190190
m1.Stop()
191191
m2.Stop()
192192

193-
// 3 are counted as built, even the bad config
194-
require.Equal(t, 3, built.Load())
193+
// Two are counted as built. The bad config is missing a stdfield so it
194+
// doesn't complete construction
195+
require.Equal(t, 2, built.Load())
195196
// Only 2 closes, because the bad config isn't closed
196197
require.Equal(t, 2, closed.Load())
197198
}

heartbeat/monitors/mocks_test.go

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -99,24 +99,28 @@ func (pc *MockPipelineConnector) ConnectWith(beat.ClientConfig) (beat.Client, er
9999
return c, nil
100100
}
101101

102-
func mockEventMonitorValidator(id string) validator.Validator {
102+
func baseMockEventMonitorValidator(id string, name string, status string) validator.Validator {
103103
var idMatcher isdef.IsDef
104104
if id == "" {
105105
idMatcher = isdef.IsStringMatching(regexp.MustCompile(`^auto-test-.*`))
106106
} else {
107107
idMatcher = isdef.IsEqual(id)
108108
}
109+
return lookslike.MustCompile(map[string]interface{}{
110+
"monitor": map[string]interface{}{
111+
"id": idMatcher,
112+
"name": name,
113+
"type": "test",
114+
"duration.us": isdef.IsDuration,
115+
"status": status,
116+
"check_group": isdef.IsString,
117+
},
118+
})
119+
}
120+
121+
func mockEventMonitorValidator(id string, name string) validator.Validator {
109122
return lookslike.Strict(lookslike.Compose(
110-
lookslike.MustCompile(map[string]interface{}{
111-
"monitor": map[string]interface{}{
112-
"id": idMatcher,
113-
"name": "",
114-
"type": "test",
115-
"duration.us": isdef.IsDuration,
116-
"status": "up",
117-
"check_group": isdef.IsString,
118-
},
119-
}),
123+
baseMockEventMonitorValidator(id, name, "up"),
120124
hbtestllext.MonitorTimespanValidator,
121125
hbtest.SummaryChecks(1, 0),
122126
lookslike.MustCompile(mockEventCustomFields()),
@@ -151,15 +155,19 @@ func mockPluginBuilder() (plugin.PluginFactory, *atomic.Int, *atomic.Int) {
151155
unpacked := struct {
152156
URLs []string `config:"urls" validate:"required"`
153157
}{}
154-
err := config.Unpack(&unpacked)
155-
if err != nil {
156-
return plugin.Plugin{}, err
157-
}
158-
j, err := createMockJob()
158+
159+
// track all closes, even on error
159160
closer := func() error {
160161
closed.Inc()
161162
return nil
162163
}
164+
165+
err := config.Unpack(&unpacked)
166+
if err != nil {
167+
return plugin.Plugin{DoClose: closer}, err
168+
}
169+
j, err := createMockJob()
170+
163171
return plugin.Plugin{Jobs: j, DoClose: closer, Endpoints: 1}, err
164172
},
165173
Stats: plugin.NewPluginCountersRecorder("test", reg)},
@@ -174,13 +182,15 @@ func mockPluginsReg() (p *plugin.PluginsReg, built *atomic.Int, closed *atomic.I
174182
return reg, built, closed
175183
}
176184

177-
func mockPluginConf(t *testing.T, id string, schedule string, url string) *common.Config {
185+
func mockPluginConf(t *testing.T, id string, name string, schedule string, url string) *common.Config {
178186
confMap := map[string]interface{}{
179187
"type": "test",
180188
"urls": []string{url},
181189
"schedule": schedule,
190+
"name": name,
182191
}
183192

193+
// Optional to let us simulate this key missing
184194
if id != "" {
185195
confMap["id"] = id
186196
}
@@ -197,7 +207,6 @@ func mockBadPluginConf(t *testing.T, id string, schedule string) *common.Config
197207
confMap := map[string]interface{}{
198208
"type": "test",
199209
"notanoption": []string{"foo"},
200-
"schedule": schedule,
201210
}
202211

203212
if id != "" {
@@ -210,8 +219,6 @@ func mockBadPluginConf(t *testing.T, id string, schedule string) *common.Config
210219
return conf
211220
}
212221

213-
// mockInvalidPlugin conf returns a config that invalid at the basic level of
214-
// what's expected in heartbeat, i.e. no type.
215222
func mockInvalidPluginConf(t *testing.T) *common.Config {
216223
confMap := map[string]interface{}{
217224
"hoeutnheou": "oueanthoue",
@@ -222,3 +229,17 @@ func mockInvalidPluginConf(t *testing.T) *common.Config {
222229

223230
return conf
224231
}
232+
233+
func mockInvalidPluginConfWithStdFields(t *testing.T, id string, name string, schedule string) *common.Config {
234+
confMap := map[string]interface{}{
235+
"type": "test",
236+
"id": id,
237+
"name": name,
238+
"schedule": schedule,
239+
}
240+
241+
conf, err := common.NewConfigFrom(confMap)
242+
require.NoError(t, err)
243+
244+
return conf
245+
}

heartbeat/monitors/monitor.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -163,13 +163,26 @@ func newMonitorUnsafe(
163163
return p.Close()
164164
}
165165

166-
wrappedJobs := wrappers.WrapCommon(p.Jobs, m.stdFields)
167-
m.endpoints = p.Endpoints
168-
166+
// If we've hit an error at this point, still run on schedule, but always return an error.
167+
// This way the error is clearly communicated through to kibana.
168+
// Since the error is not recoverable in these instances, the user will need to reconfigure
169+
// the monitor, which will destroy and recreate it in heartbeat, thus clearing this error.
170+
//
171+
// Note: we do this at this point, and no earlier, because at a minimum we need the
172+
// standard monitor fields (id, name and schedule) to deliver an error to kibana in a way
173+
// that it can render.
169174
if err != nil {
170-
return m, fmt.Errorf("job err %v", err)
175+
// Note, needed to hoist err to this scope, not just to add a prefix
176+
fullErr := fmt.Errorf("job could not be initialized: %s", err)
177+
// A placeholder job that always returns an error
178+
p.Jobs = []jobs.Job{func(event *beat.Event) ([]jobs.Job, error) {
179+
return nil, fullErr
180+
}}
171181
}
172182

183+
wrappedJobs := wrappers.WrapCommon(p.Jobs, m.stdFields)
184+
m.endpoints = p.Endpoints
185+
173186
m.configuredJobs, err = m.makeTasks(config, wrappedJobs)
174187
if err != nil {
175188
return m, err

heartbeat/monitors/monitor_test.go

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,19 +25,49 @@ import (
2525
"github.com/stretchr/testify/require"
2626

2727
"github.com/elastic/beats/v7/heartbeat/scheduler"
28+
"github.com/elastic/beats/v7/libbeat/common"
2829
"github.com/elastic/beats/v7/libbeat/monitoring"
30+
"github.com/elastic/go-lookslike"
31+
"github.com/elastic/go-lookslike/isdef"
2932
"github.com/elastic/go-lookslike/testslike"
33+
"github.com/elastic/go-lookslike/validator"
3034
)
3135

32-
func TestMonitor(t *testing.T) {
33-
serverMonConf := mockPluginConf(t, "", "@every 1ms", "http://example.net")
36+
// TestMonitorBasic tests a basic config
37+
func TestMonitorBasic(t *testing.T) {
38+
testMonitorConfig(
39+
t,
40+
mockPluginConf(t, "myId", "myName", "@every 1ms", "http://example.net"),
41+
mockEventMonitorValidator("myId", "myName"),
42+
)
43+
}
44+
45+
// TestMonitorBasic tests a config that errors out at plugin creation, but still has stdfields defined.
46+
// This should cause the monitor to run, but only produce error documents
47+
func TestMonitorCfgError(t *testing.T) {
48+
testMonitorConfig(
49+
t,
50+
mockInvalidPluginConfWithStdFields(t, "invalidTestId", "invalidTestName", "@every 10s"),
51+
lookslike.Compose(
52+
baseMockEventMonitorValidator("invalidTestId", "invalidTestName", "down"),
53+
lookslike.MustCompile(common.MapStr{
54+
"error": common.MapStr{
55+
"message": isdef.IsStringContaining("missing required field"),
56+
"type": "io",
57+
},
58+
}),
59+
),
60+
)
61+
}
62+
63+
func testMonitorConfig(t *testing.T, conf *common.Config, eventValidator validator.Validator) {
3464
reg, built, closed := mockPluginsReg()
3565
pipelineConnector := &MockPipelineConnector{}
3666

3767
sched := scheduler.Create(1, monitoring.NewRegistry(), time.Local, nil, false)
3868
defer sched.Stop()
3969

40-
mon, err := newMonitor(serverMonConf, reg, pipelineConnector, sched.Add, nil, false)
70+
mon, err := newMonitor(conf, reg, pipelineConnector, sched.Add, nil, false)
4171
require.NoError(t, err)
4272

4373
mon.Start()
@@ -56,7 +86,7 @@ func TestMonitor(t *testing.T) {
5686
pcClient.Close()
5787

5888
for _, event := range pcClient.Publishes() {
59-
testslike.Test(t, mockEventMonitorValidator(""), event.Fields)
89+
testslike.Test(t, eventValidator, event.Fields)
6090
}
6191
} else {
6292
// Let's yield this goroutine so we don't spin

heartbeat/monitors/stdfields/stdfields.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,9 @@
1818
package stdfields
1919

2020
import (
21+
"fmt"
2122
"time"
2223

23-
"github.com/pkg/errors"
24-
2524
"github.com/elastic/beats/v7/heartbeat/scheduler/schedule"
2625
"github.com/elastic/beats/v7/libbeat/common"
2726
)
@@ -46,7 +45,7 @@ func ConfigToStdMonitorFields(config *common.Config) (StdMonitorFields, error) {
4645
mpi := StdMonitorFields{Enabled: true}
4746

4847
if err := config.Unpack(&mpi); err != nil {
49-
return mpi, errors.Wrap(err, "error unpacking monitor plugin config")
48+
return mpi, fmt.Errorf("error unpacking monitor plugin config: %w", err)
5049
}
5150

5251
// Use `service_name` if `service.name` is unspecified

heartbeat/monitors/task.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ func runPublishJob(job jobs.Job, client *WrappedClient) []scheduler.TaskFunc {
108108

109109
conts, err := job(event)
110110
if err != nil {
111-
logp.Err("Job %v failed with: ", err)
111+
logp.Err("Job failed with: %s", err)
112112
}
113113

114114
hasContinuations := len(conts) > 0

0 commit comments

Comments
 (0)