-
Notifications
You must be signed in to change notification settings - Fork 5k
[otelbeat] fix status reporting when running beatreceivers #44528
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 34 commits
c6bd143
a4ead4d
5b5a218
c7ab288
8b17fb0
1f18cd6
48b8e12
47c529b
025f2c7
ba19557
308ccaf
5c53d41
afcaffe
e25c343
006f3fd
21bd422
4a7827a
0fb0cd2
70d930a
2517b55
513f433
9540d52
b44759c
cb09c7b
43e3d91
c39093e
461dd58
b54d7ef
b0b268a
5d7f818
31b99f3
a93f275
dfd1d24
e2649a4
8e33dfb
d64de30
77a7911
c274941
3f8c1ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,122 @@ | ||||||||||||||||||||||||||||||
| // Licensed to Elasticsearch B.V. under one or more contributor | ||||||||||||||||||||||||||||||
| // license agreements. See the NOTICE file distributed with | ||||||||||||||||||||||||||||||
| // this work for additional information regarding copyright | ||||||||||||||||||||||||||||||
| // ownership. Elasticsearch B.V. licenses this file to you under | ||||||||||||||||||||||||||||||
| // the Apache License, Version 2.0 (the "License"); you may | ||||||||||||||||||||||||||||||
| // not use this file except in compliance with the License. | ||||||||||||||||||||||||||||||
| // You may obtain a copy of the License at | ||||||||||||||||||||||||||||||
| // | ||||||||||||||||||||||||||||||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||||||||||||||||||||||||||||||
| // | ||||||||||||||||||||||||||||||
| // Unless required by applicable law or agreed to in writing, | ||||||||||||||||||||||||||||||
| // software distributed under the License is distributed on an | ||||||||||||||||||||||||||||||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||||||||||||||||||||||||||||||
| // KIND, either express or implied. See the License for the | ||||||||||||||||||||||||||||||
| // specific language governing permissions and limitations | ||||||||||||||||||||||||||||||
| // under the License. | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| package status | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| import ( | ||||||||||||||||||||||||||||||
| "sync" | ||||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| type runnerState struct { | ||||||||||||||||||||||||||||||
| state Status | ||||||||||||||||||||||||||||||
| msg string | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| // RunnerReporter defines an interface that returns a StatusReporter for a specific runner. | ||||||||||||||||||||||||||||||
| // This is used for grouping and managing statuses of multiple runners | ||||||||||||||||||||||||||||||
| type RunnerReporter interface { | ||||||||||||||||||||||||||||||
| GetReporterForRunner(id uint64) StatusReporter | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| // NewGroupStatusReporter creates a reporter that aggregates the statuses of multiple runners | ||||||||||||||||||||||||||||||
| // and reports the combined status to the parent StatusReporter. | ||||||||||||||||||||||||||||||
| // This is needed because multiple modules can report different statuses, and we want to avoid | ||||||||||||||||||||||||||||||
| // repeatedly flipping the parent's status. | ||||||||||||||||||||||||||||||
| func NewGroupStatusReporter(parent StatusReporter) RunnerReporter { | ||||||||||||||||||||||||||||||
| if parent == nil { | ||||||||||||||||||||||||||||||
| return &nopStatus{} | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
| return &reporter{ | ||||||||||||||||||||||||||||||
| parent: parent, | ||||||||||||||||||||||||||||||
| runnerStates: make(map[uint64]runnerState), | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| type reporter struct { | ||||||||||||||||||||||||||||||
| runnerStates map[uint64]runnerState | ||||||||||||||||||||||||||||||
| parent StatusReporter | ||||||||||||||||||||||||||||||
| mtx sync.Mutex | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| func (r *reporter) GetReporterForRunner(id uint64) StatusReporter { | ||||||||||||||||||||||||||||||
| r.mtx.Lock() | ||||||||||||||||||||||||||||||
| defer r.mtx.Unlock() | ||||||||||||||||||||||||||||||
| return &subReporter{ | ||||||||||||||||||||||||||||||
| id: id, | ||||||||||||||||||||||||||||||
| r: r, | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| func (r *reporter) updateStatusForRunner(id uint64, state Status, msg string) { | ||||||||||||||||||||||||||||||
| r.mtx.Lock() | ||||||||||||||||||||||||||||||
| defer r.mtx.Unlock() | ||||||||||||||||||||||||||||||
| if r.runnerStates == nil { | ||||||||||||||||||||||||||||||
| r.runnerStates = make(map[uint64]runnerState) | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| // add status for the runner to the map | ||||||||||||||||||||||||||||||
| r.runnerStates[id] = runnerState{ | ||||||||||||||||||||||||||||||
| state: state, | ||||||||||||||||||||||||||||||
| msg: msg, | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| // calculate the aggregate state of beat based on the module states | ||||||||||||||||||||||||||||||
| calcState, calcMsg := r.calculateState() | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| // report status to parent reporter | ||||||||||||||||||||||||||||||
| r.parent.UpdateStatus(calcState, calcMsg) | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| func (r *reporter) calculateState() (Status, string) { | ||||||||||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now that you've pointed me at the pre-existing implementation for inputs+streams in beats/x-pack/libbeat/management/unit.go Line 208 in 4309305
Why does this also need to exist? I can see what it is doing but it's not clear why you had to add this separate group reporter? The manager already has an UpdateStatus method meant to report status, and there is now an OtelManager that can do OTel specific things, why do we need to add group status reporters in metricbeat and filebeat without going through the existing manager interface?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The previous one is tied to the concept of units, which we don't have anymore? It still feels like it would be best if there was a way to account for this through the manager interface so that everything is centralized into a single interface.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the previous control protocol implementation, the status reporter was injected when the manager reloaded inputs: beats/x-pack/libbeat/management/managerV2.go Line 817 in 4309305
This also defined the reporting resolution of status reporters, there was one per unit which mapped to one per input. In the otel world there would naturally be one status reporter per receiver, and we would ideally have one per beat receiver input one day. Why can't the status reporter be injected when the receiver is created? I think you probably still need this code to toggle the receiver status based on the input UpdateStatus calls but I am suspecting the group reporters are being created in the wrong places (directly in metricbeat.go and crawler.go) which are not beat receiver specific.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'll give you an example why we need the resolution logic. Consider the following config inputs:
- module: system
metricsets:
- process
- module: system
metricsets:
- cpu
- module: system
metricsets:
- memoryIn metricbeat, we create one Runner per module. The status reporter for runner is set here beats/metricbeat/mb/module/runner_group.go Lines 44 to 50 in ac45d48
status.Reporter interface beats/libbeat/management/status/status.go Lines 46 to 49 in ac45d48
if we directly set the otel's status reporter as it is (without any group reporter), then we face following problems:
That should be the case in an ideal world, but receivers can report status via I mentioned this when I originally opened the PR, but I guess it got lost in the history.
This is on my TODO list. I'll work on it as a follow-up.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, going to summarize what I mentioned in the project meeting today to make sure it is captured here:
To use a configuration example let's imagine someone split the cpu, memory, network, and filesystem system metricsets across two inputs: inputs:
# This system/metrics module reports state independently of any other in the same configuration.
- type: system/metrics
id: cpu-memory
use_output: default
streams:
# The status of each metricset is aggregated into the state of overall parent module.
- metricsets:
- cpu
data_stream.dataset: system.cpu
- metricsets:
- memory
data_stream.dataset: system.memory
# This system/metrics module reports state independently of the one above.
- type: system/metrics
id: network-filesystem
use_output: default
streams:
# The status of each metricset is aggregated into the state of overall parent module.
- metricsets:
- network
data_stream.dataset: system.network
- metricsets:
- filesystem
data_stream.dataset: system.filesystemI think to get this to work correctly Elastic Agent would have to orchestrate both of those two inputs into separate system/metrics receivers (CC @leehinman and @swiatekm keep me honest) without doing anything else. Otherwise, we may need to do something outside of straight collector component status reporting to preserve the way this works today. If we one day gained the ability to report sub-component status they wouldn't need to be separate receivers. |
||||||||||||||||||||||||||||||
| reportedState := Running | ||||||||||||||||||||||||||||||
| reportedMsg := "" | ||||||||||||||||||||||||||||||
| for _, s := range r.runnerStates { | ||||||||||||||||||||||||||||||
| switch s.state { | ||||||||||||||||||||||||||||||
| case Degraded: | ||||||||||||||||||||||||||||||
| if reportedState != Degraded { | ||||||||||||||||||||||||||||||
| reportedState = Degraded | ||||||||||||||||||||||||||||||
| reportedMsg = s.msg | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
| case Failed: | ||||||||||||||||||||||||||||||
| // we've encountered a failed runner. | ||||||||||||||||||||||||||||||
| // short-circuit and return, as Failed state takes precedence over other states | ||||||||||||||||||||||||||||||
| return s.state, s.msg | ||||||||||||||||||||||||||||||
|
cmacknz marked this conversation as resolved.
|
||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
| return reportedState, reportedMsg | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| type nopStatus struct{} | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| type noopReporter struct{} | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| func (*noopReporter) UpdateStatus(Status, string) {} | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| func (s *nopStatus) GetReporterForRunner(id uint64) StatusReporter { | ||||||||||||||||||||||||||||||
| return &noopReporter{} | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| // subReporter implements status.StatusReporter | ||||||||||||||||||||||||||||||
| type subReporter struct { | ||||||||||||||||||||||||||||||
| id uint64 | ||||||||||||||||||||||||||||||
| r *reporter | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| func (m *subReporter) UpdateStatus(status Status, msg string) { | ||||||||||||||||||||||||||||||
| // report status to its parent | ||||||||||||||||||||||||||||||
| m.r.updateStatusForRunner(m.id, status, msg) | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| // Licensed to Elasticsearch B.V. under one or more contributor | ||
| // license agreements. See the NOTICE file distributed with | ||
| // this work for additional information regarding copyright | ||
| // ownership. Elasticsearch B.V. licenses this file to you under | ||
| // the Apache License, Version 2.0 (the "License"); you may | ||
| // not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| package status | ||
|
|
||
| import ( | ||
| "testing" | ||
|
|
||
| "github.com/stretchr/testify/require" | ||
| ) | ||
|
|
||
| type mockStatusReporter struct { | ||
| s Status | ||
| msg string | ||
| } | ||
|
|
||
| func (m *mockStatusReporter) UpdateStatus(s Status, msg string) { | ||
| m.s = s | ||
| m.msg = msg | ||
| } | ||
|
|
||
| func TestGroupStatus(t *testing.T) { | ||
| m := &mockStatusReporter{} | ||
| reporter := NewGroupStatusReporter(m) | ||
|
|
||
| subReporter1, subReporter2, subReporter3 := reporter.GetReporterForRunner(1), reporter.GetReporterForRunner(2), reporter.GetReporterForRunner(3) | ||
|
|
||
| subReporter1.UpdateStatus(Running, "") | ||
| subReporter2.UpdateStatus(Running, "") | ||
| subReporter3.UpdateStatus(Running, "") | ||
|
|
||
| require.Equal(t, m.s, Running) | ||
| require.Equal(t, m.msg, "") | ||
|
|
||
| subReporter1.UpdateStatus(Degraded, "Degrade Runner1") | ||
| require.Equal(t, m.s, Degraded) | ||
| require.Equal(t, m.msg, "Degrade Runner1") | ||
|
|
||
| subReporter2.UpdateStatus(Failed, "Failed Runner2") | ||
|
VihasMakwana marked this conversation as resolved.
Outdated
|
||
| require.Equal(t, m.s, Failed) | ||
| require.Equal(t, m.msg, "Failed Runner2") | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,8 @@ import ( | |
| "github.com/elastic/beats/v7/libbeat/beat" | ||
| "github.com/elastic/beats/v7/libbeat/cfgfile" | ||
| "github.com/elastic/beats/v7/libbeat/management" | ||
| "github.com/elastic/beats/v7/libbeat/management/status" | ||
|
|
||
| "github.com/elastic/beats/v7/libbeat/monitoring/inputmon" | ||
| "github.com/elastic/beats/v7/metricbeat/mb" | ||
| "github.com/elastic/beats/v7/metricbeat/mb/module" | ||
|
|
@@ -44,9 +46,9 @@ import ( | |
|
|
||
| // Metricbeat implements the Beater interface for metricbeat. | ||
| type Metricbeat struct { | ||
| done chan struct{} // Channel used to initiate shutdown. | ||
| stopOnce sync.Once // wraps the Stop() method | ||
| runners []cfgfile.Runner // Active list of module runners. | ||
| done chan struct{} // Channel used to initiate shutdown. | ||
| stopOnce sync.Once // wraps the Stop() method | ||
| runners map[uint64]cfgfile.Runner // Active list of module runners. | ||
| config Config | ||
| registry *mb.Register | ||
| autodiscover *autodiscover.Autodiscover | ||
|
|
@@ -154,6 +156,7 @@ func newMetricbeat(b *beat.Beat, c *conf.C, registry *mb.Register, options ...Op | |
| config: config, | ||
| registry: registry, | ||
| logger: b.Info.Logger, | ||
| runners: make(map[uint64]cfgfile.Runner), | ||
| } | ||
|
|
||
| for _, applyOption := range options { | ||
|
|
@@ -202,7 +205,12 @@ func newMetricbeat(b *beat.Beat, c *conf.C, registry *mb.Register, options ...Op | |
| return nil, err | ||
| } | ||
|
|
||
| metricbeat.runners = append(metricbeat.runners, runner) | ||
| hash, err := cfgfile.HashConfig(moduleCfg) | ||
|
cmacknz marked this conversation as resolved.
|
||
| if err != nil { | ||
| return nil, fmt.Errorf("error hashing module config: %w", err) | ||
| } | ||
|
|
||
| metricbeat.runners[hash] = runner | ||
|
Comment on lines
+208
to
+213
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is needed because each runner has to be associated with a unique id. |
||
| } | ||
|
|
||
| if len(metricbeat.runners) == 0 && !dynamicCfgEnabled { | ||
|
|
@@ -235,8 +243,15 @@ func newMetricbeat(b *beat.Beat, c *conf.C, registry *mb.Register, options ...Op | |
| func (bt *Metricbeat) Run(b *beat.Beat) error { | ||
| var wg sync.WaitGroup | ||
|
|
||
| groupReporter := status.NewGroupStatusReporter(b.Manager) | ||
|
|
||
| // Static modules (metricbeat.runners) | ||
| for _, r := range bt.runners { | ||
| for hash, r := range bt.runners { | ||
| // If the otelStatusReporter is set, we need to set the status reporter | ||
| if status, ok := r.(status.WithStatusReporter); ok { | ||
| status.SetStatusReporter(groupReporter.GetReporterForRunner(hash)) | ||
| } | ||
|
|
||
| r.Start() | ||
| wg.Add(1) | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.