From 4bcac664b4f1fa559b45b2ddaba2ad6c2bdb206c Mon Sep 17 00:00:00 2001 From: Dakota Paasman <122491662+dpaasman00@users.noreply.github.com> Date: Fri, 5 Sep 2025 11:19:50 -0400 Subject: [PATCH 1/7] configure heartbeat capability --- cmd/opampsupervisor/supervisor/config/config.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmd/opampsupervisor/supervisor/config/config.go b/cmd/opampsupervisor/supervisor/config/config.go index 21609ca15c5a1..8871aebff4176 100644 --- a/cmd/opampsupervisor/supervisor/config/config.go +++ b/cmd/opampsupervisor/supervisor/config/config.go @@ -111,6 +111,7 @@ type Capabilities struct { ReportsHealth bool `mapstructure:"reports_health"` ReportsRemoteConfig bool `mapstructure:"reports_remote_config"` ReportsAvailableComponents bool `mapstructure:"reports_available_components"` + ReportsHeartbeat bool `mapstructure:"reports_heartbeat"` } func (c Capabilities) SupportedCapabilities() protobufs.AgentCapabilities { @@ -155,6 +156,9 @@ func (c Capabilities) SupportedCapabilities() protobufs.AgentCapabilities { if c.ReportsAvailableComponents { supportedCapabilities |= protobufs.AgentCapabilities_AgentCapabilities_ReportsAvailableComponents } + if c.ReportsHeartbeat { + supportedCapabilities |= protobufs.AgentCapabilities_AgentCapabilities_ReportsHeartbeat + } return supportedCapabilities } From d2a497bd2280dd763b4a862f0a58aa0ff032ebf4 Mon Sep 17 00:00:00 2001 From: Dakota Paasman <122491662+dpaasman00@users.noreply.github.com> Date: Fri, 5 Sep 2025 15:08:24 -0400 Subject: [PATCH 2/7] add e2e test --- ...t_dakotapaasman-supervisor-heartbeats.yaml | 27 ++++++++++ cmd/opampsupervisor/e2e_test.go | 54 +++++++++++++++++++ .../supervisor_reports_heartbeat.yaml | 19 +++++++ 3 files changed, 100 insertions(+) create mode 100644 .chloggen/feat_dakotapaasman-supervisor-heartbeats.yaml create mode 100644 cmd/opampsupervisor/testdata/supervisor/supervisor_reports_heartbeat.yaml diff --git a/.chloggen/feat_dakotapaasman-supervisor-heartbeats.yaml 
b/.chloggen/feat_dakotapaasman-supervisor-heartbeats.yaml new file mode 100644 index 0000000000000..2ec11b7161866 --- /dev/null +++ b/.chloggen/feat_dakotapaasman-supervisor-heartbeats.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: "enhancement" + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: "opampsupervisor" + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: "Add support for client initiated OpAMP heartbeats in the supervisor." + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. 
+# Default: '[user]' +change_logs: [] diff --git a/cmd/opampsupervisor/e2e_test.go b/cmd/opampsupervisor/e2e_test.go index b2d16e592fe73..34a5d42d988dc 100644 --- a/cmd/opampsupervisor/e2e_test.go +++ b/cmd/opampsupervisor/e2e_test.go @@ -2501,3 +2501,57 @@ func TestSupervisorEmitBootstrapTelemetry(t *testing.T) { require.Truef(t, gotSpan, "expected to find span '%s', but did not find it", expectedSpan) } } + +func TestSupervisorReportsHeartbeat(t *testing.T) { + var heartbeatReport atomic.Bool + server := newOpAMPServer( + t, + defaultConnectingHandler, + types.ConnectionCallbacks{ + OnMessage: func(_ context.Context, _ types.Connection, message *protobufs.AgentToServer) *protobufs.ServerToAgent { + if isHeartbeatMessage(message) { + heartbeatReport.Store(true) + } + return &protobufs.ServerToAgent{} + }, + }, + ) + s, _ := newSupervisor(t, "reports_heartbeat", map[string]string{"url": server.addr}) + + require.Nil(t, s.Start()) + defer s.Shutdown() + + waitForSupervisorConnection(server.supervisorConnected, true) + + // Set the heartbeat interval to 5 seconds + server.sendToSupervisor(&protobufs.ServerToAgent{ + ConnectionSettings: &protobufs.ConnectionSettingsOffers{ + Opamp: &protobufs.OpAMPConnectionSettings{ + HeartbeatIntervalSeconds: 5, + }, + }, + }) + + require.Eventually(t, func() bool { + return heartbeatReport.Load() + }, 7*time.Second, 250*time.Millisecond) +} + +// isHeartbeatMessage returns true if all fields of the message are nil. 
+func isHeartbeatMessage(message *protobufs.AgentToServer) bool { + empty := true + + empty = empty && message.AgentDescription == nil + empty = empty && message.Health == nil + empty = empty && message.EffectiveConfig == nil + empty = empty && message.RemoteConfigStatus == nil + empty = empty && message.PackageStatuses == nil + empty = empty && message.AgentDisconnect == nil + empty = empty && message.ConnectionSettingsRequest == nil + empty = empty && message.CustomCapabilities == nil + empty = empty && message.CustomMessage == nil + empty = empty && message.AvailableComponents == nil + empty = empty && message.Flags == 0 + + return empty +} diff --git a/cmd/opampsupervisor/testdata/supervisor/supervisor_reports_heartbeat.yaml b/cmd/opampsupervisor/testdata/supervisor/supervisor_reports_heartbeat.yaml new file mode 100644 index 0000000000000..5c2f1938abe7a --- /dev/null +++ b/cmd/opampsupervisor/testdata/supervisor/supervisor_reports_heartbeat.yaml @@ -0,0 +1,19 @@ +server: + endpoint: ws://{{.url}}/v1/opamp + +capabilities: + reports_effective_config: true + reports_own_metrics: true + reports_own_logs: true + reports_own_traces: true + reports_health: true + accepts_remote_config: true + reports_remote_config: true + accepts_opamp_connection_settings: true + reports_heartbeat: true + +storage: + directory: "{{.storage_dir}}" + +agent: + executable: ../../bin/otelcontribcol_{{.goos}}_{{.goarch}}{{.extension}} From b41a08b9fb6215816d20713ba17f1f8ce4d7f553 Mon Sep 17 00:00:00 2001 From: Dakota Paasman <122491662+dpaasman00@users.noreply.github.com> Date: Fri, 5 Sep 2025 15:15:45 -0400 Subject: [PATCH 3/7] update doc and config default --- ...t_dakotapaasman-supervisor-heartbeats.yaml | 2 +- cmd/opampsupervisor/specification/README.md | 100 +++++++++--------- .../supervisor/config/config.go | 1 + .../supervisor/config/config_test.go | 9 +- 4 files changed, 60 insertions(+), 52 deletions(-) diff --git a/.chloggen/feat_dakotapaasman-supervisor-heartbeats.yaml 
b/.chloggen/feat_dakotapaasman-supervisor-heartbeats.yaml index 2ec11b7161866..f0eb710966587 100644 --- a/.chloggen/feat_dakotapaasman-supervisor-heartbeats.yaml +++ b/.chloggen/feat_dakotapaasman-supervisor-heartbeats.yaml @@ -10,7 +10,7 @@ component: "opampsupervisor" note: "Add support for client initiated OpAMP heartbeats in the supervisor." # Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. -issues: [] +issues: [42533] # (Optional) One or more lines of additional information to render under the primary note. # These lines will be padded with 2 spaces and then inserted directly into the document. diff --git a/cmd/opampsupervisor/specification/README.md b/cmd/opampsupervisor/specification/README.md index da799394b573e..08bf2802f8a6f 100644 --- a/cmd/opampsupervisor/specification/README.md +++ b/cmd/opampsupervisor/specification/README.md @@ -10,7 +10,7 @@ for the Collector in 2 different ways: - As a Collector extension, with limited functionality, - As an external Supervisor, that implements all or most of OpAMP - capabilities. + capabilities. In discussions with users and Collector contributors we found that both of these approaches are wanted. This document describes how to implement @@ -37,34 +37,34 @@ Here is how a Supervisor-based management works: The Supervisor process does the following: - Implements the client-side of OpAMP protocol and communicates with - the OpAMP Backend. + the OpAMP Backend. - Starts/stops the Collector process as necessary. - Receives configuration from the OpAMP Backend and pushes it to the - Collector, using the Collector config.yaml file as an intermediary, - restarting the Collector process as necessary. + Collector, using the Collector config.yaml file as an intermediary, + restarting the Collector process as necessary. - Serves as a watchdog, restarts the Collector process if the - Collector crashes. 
-- Accepts an OpAMP connection from Collectors' [*opamp - extension*](#collectors-opamp-extension), receives the Collector's - AgentDescription, HealthStatus and EffectiveConfig messages and - forwards them to the OpAMP Backend. + Collector crashes. +- Accepts an OpAMP connection from Collectors' [_opamp + extension_](#collectors-opamp-extension), receives the Collector's + AgentDescription, HealthStatus and EffectiveConfig messages and + forwards them to the OpAMP Backend. - Optionally: downloads Collector executable packages offered by the - Backend and performs the Collector updates. + Backend and performs the Collector updates. - Optionally: configures Collector to collect Collector's own metrics - and report the metrics to the OTLP telemetry backend requested by - OpAMP Backend. + and report the metrics to the OTLP telemetry backend requested by + OpAMP Backend. - Optionally: collects Collector logs and sends them to the Telemetry - Backend via OTLP. + Backend via OTLP. Supervisor is implemented as a Go library that may be customized and rebuilt by vendors with useful default configurations, such as the OpAMP Backend endpoint to connect to, in order to minimize the manual configuration required. -*Important: the Supervisor needs to be highly stable, so we need to keep +_Important: the Supervisor needs to be highly stable, so we need to keep its complexity and functionality to minimum. The features listed in this section need a critical review and may be removed (responsibility moved -elsewhere, e.g. to the Collector itself).* +elsewhere, e.g. to the Collector itself)._ ### Supervisor Configuration @@ -116,6 +116,9 @@ capabilities: # The Collector will report Health. reports_health: # true if unspecified + # The supervisor will report OpAMP heartbeats to the Server. + reports_heartbeat: # true if unspecified + storage: # A writable directory where the Supervisor can store data # (e.g. cached remote config). @@ -221,7 +224,6 @@ telemetry: # Resource attributes. 
resource: service.namespace: otel-demo - ``` #### Notes on `agent::config_files`, `agent::args`, and `agent::env` @@ -275,13 +277,13 @@ Take the configuration below as an example: agent: executable: ./otel-binary config_files: - - './custom-config.yaml' - - './another-custom-config.yaml' + - "./custom-config.yaml" + - "./another-custom-config.yaml" args: - - '--feature-gates exporter.datadogexporter.UseLogsAgentExporter,exporter.datadogexporter.metricexportnativeclient' + - "--feature-gates exporter.datadogexporter.UseLogsAgentExporter,exporter.datadogexporter.metricexportnativeclient" env: - HOME: '/dev/home' - GO_HOME: '~/go' + HOME: "/dev/home" + GO_HOME: "~/go" ``` This results in the following Collector process invocation: @@ -326,8 +328,8 @@ Note: this capability must be manually enabled by the user via a AcceptsRemoteConfig setting in the supervisor config file and is disabled by default. -The Supervisor receives [*Remote -Configuration*](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md#configuration) +The Supervisor receives [_Remote +Configuration_](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md#configuration) from the OpAMP Backend, merges it with an optional local config file and writes it to the Collector's config file, then restarts the Collector. @@ -354,13 +356,13 @@ The Supervisor will locate all such entries while building the Collector config file and will delete the ones which are prohibited by the access control settings. -*Open Question: if after sanitizing the component's directory setting -the configuration becomes invalid what do we do?* +_Open Question: if after sanitizing the component's directory setting +the configuration becomes invalid what do we do?_ -*The sanitizing logic is hard-coded in the Supervisor and works for +_The sanitizing logic is hard-coded in the Supervisor and works for specific components only. 
In the future we will consider implementing a more generic safety mechanism that does not depend on the knowledge -about specific component behavior.* +about specific component behavior._ #### Bootstrapping @@ -470,8 +472,8 @@ The Supervisor will also write the Collector's log to a local log file. The path to the Collector log files will be printed in the Supervisor output. -*Open Question: instead of writing to a local log file do we want to -pipe Collector logs to Supervisor's log output?* +_Open Question: instead of writing to a local log file do we want to +pipe Collector logs to Supervisor's log output?_ ### Collector Executable Updates @@ -537,19 +539,19 @@ Collector's configuration. The opamp extension implements an OpAMP client with a small subset of OpAMP agent capabilities: - ReportsStatus. The extension reports agent description and status. - This is the first message from the client to the server in the OpAMP - protocol that is essential for beginning OpAMP message exchange. + This is the first message from the client to the server in the OpAMP + protocol that is essential for beginning OpAMP message exchange. - ReportsEffectiveConfig. The extension reports the Collector's - effective config on startup and any time the config changes. In - order to do this the opamp extension needs [access to the effective - config](https://github.com/open-telemetry/opentelemetry-collector/issues/6596). + effective config on startup and any time the config changes. In + order to do this the opamp extension needs [access to the effective + config](https://github.com/open-telemetry/opentelemetry-collector/issues/6596). - ReportsHealth. The extension reports Collector's health on startup - and any time the health changes. In order to do this the opamp - extension needs access to the health of the Collector. 
The very - basic health capability can be replicated by mirroring the - functionality of the healthcheck extension, a more advanced - capability depends on the [component status - reporting](https://github.com/open-telemetry/opentelemetry-collector/pull/6560). + and any time the health changes. In order to do this the opamp + extension needs access to the health of the Collector. The very + basic health capability can be replicated by mirroring the + functionality of the healthcheck extension, a more advanced + capability depends on the [component status + reporting](https://github.com/open-telemetry/opentelemetry-collector/pull/6560). The messages received from the opamp extension are forwarded by the Supervisor to the destination OpAMP Backend and replies to these @@ -600,25 +602,25 @@ is an [open issue](https://github.com/open-telemetry/opentelemetry-collector/issues/6599) to allow this. -*Open Question: when used with Supervisor do we want the Supervisor to +_Open Question: when used with Supervisor do we want the Supervisor to actively periodically query the health of the Collector or we can rely -on opamp extension to report the health when it changes?* +on opamp extension to report the health when it changes?_ ## Future Work - Decide if we want to have Supervisor-less AcceptsRemoteConfig - capability in the Collector. This currently can't be done by using - just an extension. At the minimum it requires a config Provider. + capability in the Collector. This currently can't be done by using + just an extension. At the minimum it requires a config Provider. - Consider extending the Supervisor to be able to manage multiple - Collector instances. + Collector instances. 
## References - OpAMP Specification: - [https://github.com/open-telemetry/opamp-spec/blob/main/specification.md](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md) + [https://github.com/open-telemetry/opamp-spec/blob/main/specification.md](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md) - OpAMP client and server implementation in Go: - [https://github.com/open-telemetry/opamp-go](https://github.com/open-telemetry/opamp-go) + [https://github.com/open-telemetry/opamp-go](https://github.com/open-telemetry/opamp-go) - Example Supervisor implementation: - [https://github.com/open-telemetry/opamp-go/tree/main/internal/examples/supervisor](https://github.com/open-telemetry/opamp-go/tree/main/internal/examples/supervisor) + [https://github.com/open-telemetry/opamp-go/tree/main/internal/examples/supervisor](https://github.com/open-telemetry/opamp-go/tree/main/internal/examples/supervisor) - OpAMP Milestone in the Collector: - [https://github.com/open-telemetry/opentelemetry-collector/milestone/29](https://github.com/open-telemetry/opentelemetry-collector/milestone/29) + [https://github.com/open-telemetry/opentelemetry-collector/milestone/29](https://github.com/open-telemetry/opentelemetry-collector/milestone/29) diff --git a/cmd/opampsupervisor/supervisor/config/config.go b/cmd/opampsupervisor/supervisor/config/config.go index 8871aebff4176..0bb278ec3afb3 100644 --- a/cmd/opampsupervisor/supervisor/config/config.go +++ b/cmd/opampsupervisor/supervisor/config/config.go @@ -343,6 +343,7 @@ func DefaultSupervisor() Supervisor { ReportsHealth: true, ReportsRemoteConfig: false, ReportsAvailableComponents: false, + ReportsHeartbeat: true, }, Storage: Storage{ Directory: defaultStorageDir, diff --git a/cmd/opampsupervisor/supervisor/config/config_test.go b/cmd/opampsupervisor/supervisor/config/config_test.go index 1db50aea5ee3b..33295e7193959 100644 --- a/cmd/opampsupervisor/supervisor/config/config_test.go +++ 
b/cmd/opampsupervisor/supervisor/config/config_test.go @@ -514,7 +514,8 @@ func TestCapabilities_SupportedCapabilities(t *testing.T) { expectedAgentCapabilities: protobufs.AgentCapabilities_AgentCapabilities_ReportsStatus | protobufs.AgentCapabilities_AgentCapabilities_ReportsOwnMetrics | protobufs.AgentCapabilities_AgentCapabilities_ReportsEffectiveConfig | - protobufs.AgentCapabilities_AgentCapabilities_ReportsHealth, + protobufs.AgentCapabilities_AgentCapabilities_ReportsHealth | + protobufs.AgentCapabilities_AgentCapabilities_ReportsHeartbeat, }, { name: "Empty capabilities", @@ -534,6 +535,7 @@ func TestCapabilities_SupportedCapabilities(t *testing.T) { ReportsHealth: true, ReportsRemoteConfig: true, ReportsAvailableComponents: true, + ReportsHeartbeat: true, }, expectedAgentCapabilities: protobufs.AgentCapabilities_AgentCapabilities_ReportsStatus | protobufs.AgentCapabilities_AgentCapabilities_ReportsEffectiveConfig | @@ -545,7 +547,8 @@ func TestCapabilities_SupportedCapabilities(t *testing.T) { protobufs.AgentCapabilities_AgentCapabilities_ReportsRemoteConfig | protobufs.AgentCapabilities_AgentCapabilities_AcceptsRestartCommand | protobufs.AgentCapabilities_AgentCapabilities_AcceptsOpAMPConnectionSettings | - protobufs.AgentCapabilities_AgentCapabilities_ReportsAvailableComponents, + protobufs.AgentCapabilities_AgentCapabilities_ReportsAvailableComponents | + protobufs.AgentCapabilities_AgentCapabilities_ReportsHeartbeat, }, } @@ -620,6 +623,7 @@ capabilities: reports_remote_config: true accepts_restart_command: true accepts_opamp_connection_settings: true + reports_heartbeat: true storage: directory: %s @@ -662,6 +666,7 @@ telemetry: ReportsRemoteConfig: true, AcceptsRestartCommand: true, AcceptsOpAMPConnectionSettings: true, + ReportsHeartbeat: true, }, Storage: Storage{ Directory: filepath.Join(tmpDir, "storage"), From 431b80e90273f8daffef720b30ffcf37c51d980d Mon Sep 17 00:00:00 2001 From: Dakota Paasman <122491662+dpaasman00@users.noreply.github.com> 
Date: Fri, 5 Sep 2025 16:00:04 -0400 Subject: [PATCH 4/7] update doc and e2e test --- cmd/opampsupervisor/e2e_test.go | 4 +- cmd/opampsupervisor/specification/README.md | 107 +++++++++++--------- 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/cmd/opampsupervisor/e2e_test.go b/cmd/opampsupervisor/e2e_test.go index 34a5d42d988dc..26112a3b8ff8b 100644 --- a/cmd/opampsupervisor/e2e_test.go +++ b/cmd/opampsupervisor/e2e_test.go @@ -2527,14 +2527,14 @@ func TestSupervisorReportsHeartbeat(t *testing.T) { server.sendToSupervisor(&protobufs.ServerToAgent{ ConnectionSettings: &protobufs.ConnectionSettingsOffers{ Opamp: &protobufs.OpAMPConnectionSettings{ - HeartbeatIntervalSeconds: 5, + HeartbeatIntervalSeconds: 1, }, }, }) require.Eventually(t, func() bool { return heartbeatReport.Load() - }, 7*time.Second, 250*time.Millisecond) + }, 3*time.Second, 250*time.Millisecond) } // isHeartbeatMessage returns true if all fields of the message are nil. diff --git a/cmd/opampsupervisor/specification/README.md b/cmd/opampsupervisor/specification/README.md index 08bf2802f8a6f..b077fb2308580 100644 --- a/cmd/opampsupervisor/specification/README.md +++ b/cmd/opampsupervisor/specification/README.md @@ -10,7 +10,7 @@ for the Collector in 2 different ways: - As a Collector extension, with limited functionality, - As an external Supervisor, that implements all or most of OpAMP - capabilities. + capabilities. In discussions with users and Collector contributors we found that both of these approaches are wanted. This document describes how to implement @@ -37,34 +37,34 @@ Here is how a Supervisor-based management works: The Supervisor process does the following: - Implements the client-side of OpAMP protocol and communicates with - the OpAMP Backend. + the OpAMP Backend. - Starts/stops the Collector process as necessary. 
- Receives configuration from the OpAMP Backend and pushes it to the - Collector, using the Collector config.yaml file as an intermediary, - restarting the Collector process as necessary. + Collector, using the Collector config.yaml file as an intermediary, + restarting the Collector process as necessary. - Serves as a watchdog, restarts the Collector process if the - Collector crashes. -- Accepts an OpAMP connection from Collectors' [_opamp - extension_](#collectors-opamp-extension), receives the Collector's - AgentDescription, HealthStatus and EffectiveConfig messages and - forwards them to the OpAMP Backend. + Collector crashes. +- Accepts an OpAMP connection from Collectors' [*opamp + extension*](#collectors-opamp-extension), receives the Collector's + AgentDescription, HealthStatus and EffectiveConfig messages and + forwards them to the OpAMP Backend. - Optionally: downloads Collector executable packages offered by the - Backend and performs the Collector updates. + Backend and performs the Collector updates. - Optionally: configures Collector to collect Collector's own metrics - and report the metrics to the OTLP telemetry backend requested by - OpAMP Backend. + and report the metrics to the OTLP telemetry backend requested by + OpAMP Backend. - Optionally: collects Collector logs and sends them to the Telemetry - Backend via OTLP. + Backend via OTLP. Supervisor is implemented as a Go library that may be customized and rebuilt by vendors with useful default configurations, such as the OpAMP Backend endpoint to connect to, in order to minimize the manual configuration required. -_Important: the Supervisor needs to be highly stable, so we need to keep +*Important: the Supervisor needs to be highly stable, so we need to keep its complexity and functionality to minimum. The features listed in this section need a critical review and may be removed (responsibility moved -elsewhere, e.g. to the Collector itself)._ +elsewhere, e.g. 
to the Collector itself).* ### Supervisor Configuration @@ -224,6 +224,7 @@ telemetry: # Resource attributes. resource: service.namespace: otel-demo + ``` #### Notes on `agent::config_files`, `agent::args`, and `agent::env` @@ -277,13 +278,13 @@ Take the configuration below as an example: agent: executable: ./otel-binary config_files: - - "./custom-config.yaml" - - "./another-custom-config.yaml" + - './custom-config.yaml' + - './another-custom-config.yaml' args: - - "--feature-gates exporter.datadogexporter.UseLogsAgentExporter,exporter.datadogexporter.metricexportnativeclient" + - '--feature-gates exporter.datadogexporter.UseLogsAgentExporter,exporter.datadogexporter.metricexportnativeclient' env: - HOME: "/dev/home" - GO_HOME: "~/go" + HOME: '/dev/home' + GO_HOME: '~/go' ``` This results in the following Collector process invocation: @@ -328,8 +329,8 @@ Note: this capability must be manually enabled by the user via a AcceptsRemoteConfig setting in the supervisor config file and is disabled by default. -The Supervisor receives [_Remote -Configuration_](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md#configuration) +The Supervisor receives [*Remote +Configuration*](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md#configuration) from the OpAMP Backend, merges it with an optional local config file and writes it to the Collector's config file, then restarts the Collector. @@ -356,13 +357,13 @@ The Supervisor will locate all such entries while building the Collector config file and will delete the ones which are prohibited by the access control settings. 
-_Open Question: if after sanitizing the component's directory setting -the configuration becomes invalid what do we do?_ +*Open Question: if after sanitizing the component's directory setting +the configuration becomes invalid what do we do?* -_The sanitizing logic is hard-coded in the Supervisor and works for +*The sanitizing logic is hard-coded in the Supervisor and works for specific components only. In the future we will consider implementing a more generic safety mechanism that does not depend on the knowledge -about specific component behavior._ +about specific component behavior.* #### Bootstrapping @@ -472,8 +473,8 @@ The Supervisor will also write the Collector's log to a local log file. The path to the Collector log files will be printed in the Supervisor output. -_Open Question: instead of writing to a local log file do we want to -pipe Collector logs to Supervisor's log output?_ +*Open Question: instead of writing to a local log file do we want to +pipe Collector logs to Supervisor's log output?* ### Collector Executable Updates @@ -506,6 +507,16 @@ the next Collector start (at the minimum the version number to be included in AgentDescription is expected to change after the executable is updated). +### OpAMP Heartbeats + +OpAMP heartbeats are enabled by default in the Supervisor. They can be +disabled by setting `capabilities.reports_heartbeat` to `false`. The +default interval is 30 seconds, but this can be changed by the OpAMP +server sending a ServerToAgent message with the appropriate field set. +This causes the Supervisor to periodically send an empty OpAMP +AgentToServer message in order to keep the connection alive. +For more information see the [OpAMP specification](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md#opampconnectionsettingsheartbeat_interval_seconds). + ### Addons Management The Collector currently does not have a concept of addons so this OpAMP @@ -539,19 +550,19 @@ Collector's configuration. 
The opamp extension implements an OpAMP client with a small subset of OpAMP agent capabilities: - ReportsStatus. The extension reports agent description and status. - This is the first message from the client to the server in the OpAMP - protocol that is essential for beginning OpAMP message exchange. + This is the first message from the client to the server in the OpAMP + protocol that is essential for beginning OpAMP message exchange. - ReportsEffectiveConfig. The extension reports the Collector's - effective config on startup and any time the config changes. In - order to do this the opamp extension needs [access to the effective - config](https://github.com/open-telemetry/opentelemetry-collector/issues/6596). + effective config on startup and any time the config changes. In + order to do this the opamp extension needs [access to the effective + config](https://github.com/open-telemetry/opentelemetry-collector/issues/6596). - ReportsHealth. The extension reports Collector's health on startup - and any time the health changes. In order to do this the opamp - extension needs access to the health of the Collector. The very - basic health capability can be replicated by mirroring the - functionality of the healthcheck extension, a more advanced - capability depends on the [component status - reporting](https://github.com/open-telemetry/opentelemetry-collector/pull/6560). + and any time the health changes. In order to do this the opamp + extension needs access to the health of the Collector. The very + basic health capability can be replicated by mirroring the + functionality of the healthcheck extension, a more advanced + capability depends on the [component status + reporting](https://github.com/open-telemetry/opentelemetry-collector/pull/6560). 
The messages received from the opamp extension are forwarded by the Supervisor to the destination OpAMP Backend and replies to these @@ -602,25 +613,25 @@ is an [open issue](https://github.com/open-telemetry/opentelemetry-collector/issues/6599) to allow this. -_Open Question: when used with Supervisor do we want the Supervisor to +*Open Question: when used with Supervisor do we want the Supervisor to actively periodically query the health of the Collector or we can rely -on opamp extension to report the health when it changes?_ +on opamp extension to report the health when it changes?* ## Future Work - Decide if we want to have Supervisor-less AcceptsRemoteConfig - capability in the Collector. This currently can't be done by using - just an extension. At the minimum it requires a config Provider. + capability in the Collector. This currently can't be done by using + just an extension. At the minimum it requires a config Provider. - Consider extending the Supervisor to be able to manage multiple - Collector instances. + Collector instances. 
## References - OpAMP Specification: - [https://github.com/open-telemetry/opamp-spec/blob/main/specification.md](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md) + [https://github.com/open-telemetry/opamp-spec/blob/main/specification.md](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md) - OpAMP client and server implementation in Go: - [https://github.com/open-telemetry/opamp-go](https://github.com/open-telemetry/opamp-go) + [https://github.com/open-telemetry/opamp-go](https://github.com/open-telemetry/opamp-go) - Example Supervisor implementation: - [https://github.com/open-telemetry/opamp-go/tree/main/internal/examples/supervisor](https://github.com/open-telemetry/opamp-go/tree/main/internal/examples/supervisor) + [https://github.com/open-telemetry/opamp-go/tree/main/internal/examples/supervisor](https://github.com/open-telemetry/opamp-go/tree/main/internal/examples/supervisor) - OpAMP Milestone in the Collector: - [https://github.com/open-telemetry/opentelemetry-collector/milestone/29](https://github.com/open-telemetry/opentelemetry-collector/milestone/29) + [https://github.com/open-telemetry/opentelemetry-collector/milestone/29](https://github.com/open-telemetry/opentelemetry-collector/milestone/29) From f16dd12fed595f5f2eeb1467c25851a092d653ca Mon Sep 17 00:00:00 2001 From: Dakota Paasman <122491662+dpaasman00@users.noreply.github.com> Date: Mon, 8 Sep 2025 12:08:57 -0400 Subject: [PATCH 5/7] supervisor remembers and configures heartbeat interval --- cmd/opampsupervisor/e2e_test.go | 9 ++++++++- cmd/opampsupervisor/supervisor/supervisor.go | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/cmd/opampsupervisor/e2e_test.go b/cmd/opampsupervisor/e2e_test.go index 26112a3b8ff8b..e8d3965568fb4 100644 --- a/cmd/opampsupervisor/e2e_test.go +++ b/cmd/opampsupervisor/e2e_test.go @@ -2523,15 +2523,22 @@ func TestSupervisorReportsHeartbeat(t *testing.T) { 
waitForSupervisorConnection(server.supervisorConnected, true) - // Set the heartbeat interval to 5 seconds + // Set the heartbeat interval to 1 second server.sendToSupervisor(&protobufs.ServerToAgent{ ConnectionSettings: &protobufs.ConnectionSettingsOffers{ Opamp: &protobufs.OpAMPConnectionSettings{ + DestinationEndpoint: "ws://" + server.addr + "/v1/opamp", HeartbeatIntervalSeconds: 1, }, }, }) + // supervisor disconnects from the server + waitForSupervisorConnection(server.supervisorConnected, false) + + // supervisor reconnects to the server + waitForSupervisorConnection(server.supervisorConnected, true) + require.Eventually(t, func() bool { return heartbeatReport.Load() }, 3*time.Second, 250*time.Millisecond) diff --git a/cmd/opampsupervisor/supervisor/supervisor.go b/cmd/opampsupervisor/supervisor/supervisor.go index 6234f57346fff..48ff15d8337ec 100644 --- a/cmd/opampsupervisor/supervisor/supervisor.go +++ b/cmd/opampsupervisor/supervisor/supervisor.go @@ -186,6 +186,10 @@ type Supervisor struct { featureGates map[string]struct{} metrics *supervisorTelemetry.Metrics + + // heartbeatIntervalSeconds is the interval, in seconds, at which the OpAMP client is configured to send heartbeats. + // Default is 30 seconds but can be overridden by the OpAMP server with an OpAMPConnectionSettings message. 
+ heartbeatIntervalSeconds uint64 } func NewSupervisor(ctx context.Context, logger *zap.Logger, cfg config.Supervisor) (*Supervisor, error) { @@ -204,6 +208,7 @@ func NewSupervisor(ctx context.Context, logger *zap.Logger, cfg config.Superviso agentReady: atomic.Bool{}, agentReadyChan: make(chan struct{}, 1), metrics: &supervisorTelemetry.Metrics{}, + heartbeatIntervalSeconds: 30, } s.runCtx, s.runCtxCancel = context.WithCancel(ctx) @@ -713,6 +718,12 @@ func (s *Supervisor) startOpAMPClient() error { return err } + // Set heartbeat interval if the agent supports it + if s.config.Capabilities.ReportsHeartbeat { + d := time.Duration(s.heartbeatIntervalSeconds) * time.Second + settings.HeartbeatInterval = &d + } + s.telemetrySettings.Logger.Debug("Starting OpAMP client...") if err := s.opampClient.Start(s.runCtx, settings); err != nil { return err @@ -1027,6 +1038,11 @@ func (s *Supervisor) onOpampConnectionSettings(_ context.Context, settings *prot return err } + // Update the heartbeat interval if the agent supports it + if s.config.Capabilities.ReportsHeartbeat { + s.heartbeatIntervalSeconds = settings.HeartbeatIntervalSeconds + } + if err := s.stopOpAMPClient(); err != nil { s.telemetrySettings.Logger.Error("Cannot stop the OpAMP client", zap.Error(err)) return err From a39c0253655f202be38106cc9176bc02cb50294a Mon Sep 17 00:00:00 2001 From: Dakota Paasman <122491662+dpaasman00@users.noreply.github.com> Date: Mon, 8 Sep 2025 12:48:38 -0400 Subject: [PATCH 6/7] context --- cmd/opampsupervisor/e2e_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/opampsupervisor/e2e_test.go b/cmd/opampsupervisor/e2e_test.go index e8d3965568fb4..db261202ec36a 100644 --- a/cmd/opampsupervisor/e2e_test.go +++ b/cmd/opampsupervisor/e2e_test.go @@ -2518,7 +2518,7 @@ func TestSupervisorReportsHeartbeat(t *testing.T) { ) s, _ := newSupervisor(t, "reports_heartbeat", map[string]string{"url": server.addr}) - require.Nil(t, s.Start()) + require.Nil(t, 
s.Start(t.Context())) defer s.Shutdown() waitForSupervisorConnection(server.supervisorConnected, true) From ae1e140554efeca74f3716bce42b53117c27b213 Mon Sep 17 00:00:00 2001 From: Dakota Paasman <122491662+dpaasman00@users.noreply.github.com> Date: Mon, 8 Sep 2025 13:06:46 -0400 Subject: [PATCH 7/7] update capabilities e2e test --- cmd/opampsupervisor/e2e_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/opampsupervisor/e2e_test.go b/cmd/opampsupervisor/e2e_test.go index db261202ec36a..7b326f571f84f 100644 --- a/cmd/opampsupervisor/e2e_test.go +++ b/cmd/opampsupervisor/e2e_test.go @@ -882,7 +882,7 @@ func TestSupervisorConfiguresCapabilities(t *testing.T) { require.Eventually(t, func() bool { caps := capabilities.Load() - return caps == uint64(protobufs.AgentCapabilities_AgentCapabilities_ReportsStatus) + return caps == uint64(protobufs.AgentCapabilities_AgentCapabilities_ReportsStatus|protobufs.AgentCapabilities_AgentCapabilities_ReportsHeartbeat) }, 5*time.Second, 250*time.Millisecond) }