Skip to content

Commit 9b0007a

Browse files
authored
[core][metric] error handling of metric+event exporter agent (#57925)
This PR moves the error handling of the metric+event exporter agent one level up, into the `metrics_agent_client` callback. Previously, errors were handled by either the metric recorder or the event recorder, which led to confusion and buggy code. Test: - CI --------- Signed-off-by: Cuong Nguyen <[email protected]>
1 parent 4badd82 commit 9b0007a

File tree

4 files changed

+23
-17
lines changed

4 files changed

+23
-17
lines changed

src/ray/core_worker/core_worker_process.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -819,7 +819,13 @@ CoreWorkerProcessImpl::CoreWorkerProcessImpl(const CoreWorkerOptions &options)
819819
io_service_,
820820
*write_locked.Get()->client_call_manager_);
821821
metrics_agent_client_->WaitForServerReady([this](const Status &server_status) {
822-
stats::InitOpenTelemetryExporter(options_.metrics_agent_port, server_status);
822+
if (server_status.ok()) {
823+
stats::InitOpenTelemetryExporter(options_.metrics_agent_port);
824+
} else {
825+
RAY_LOG(ERROR) << "Failed to establish connection to the metrics exporter agent. "
826+
"Metrics will not be exported. "
827+
<< "Exporter agent status: " << server_status.ToString();
828+
}
823829
});
824830
}
825831
}

src/ray/gcs/gcs_server.cc

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -295,13 +295,14 @@ void GcsServer::DoStart(const GcsInitData &gcs_init_data) {
295295

296296
// Init metrics and event exporter.
297297
metrics_agent_client_->WaitForServerReady([this](const Status &server_status) {
298-
stats::InitOpenTelemetryExporter(config_.metrics_agent_port, server_status);
299298
if (server_status.ok()) {
299+
stats::InitOpenTelemetryExporter(config_.metrics_agent_port);
300300
ray_event_recorder_->StartExportingEvents();
301301
} else {
302-
RAY_LOG(ERROR) << "Failed to establish connection to the event exporter. Events "
303-
"will not be exported. "
304-
<< "Event exporter status: " << server_status.ToString();
302+
RAY_LOG(ERROR)
303+
<< "Failed to establish connection to the event+metrics exporter agent. "
304+
"Events and metrics will not be exported. "
305+
<< "Exporter agent status: " << server_status.ToString();
305306
}
306307
});
307308

src/ray/raylet/main.cc

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -967,10 +967,16 @@ int main(int argc, char *argv[]) {
967967
ray::stats::Init(global_tags, metrics_agent_port, ray::WorkerID::Nil());
968968
metrics_agent_client = std::make_unique<ray::rpc::MetricsAgentClientImpl>(
969969
"127.0.0.1", metrics_agent_port, main_service, *client_call_manager);
970-
metrics_agent_client->WaitForServerReady(
971-
[metrics_agent_port](const ray::Status &server_status) {
972-
ray::stats::InitOpenTelemetryExporter(metrics_agent_port, server_status);
973-
});
970+
metrics_agent_client->WaitForServerReady([metrics_agent_port](
971+
const ray::Status &server_status) {
972+
if (server_status.ok()) {
973+
ray::stats::InitOpenTelemetryExporter(metrics_agent_port);
974+
} else {
975+
RAY_LOG(ERROR) << "Failed to establish connection to the metrics exporter agent. "
976+
"Metrics will not be exported. "
977+
<< "Exporter agent status: " << server_status.ToString();
978+
}
979+
});
974980

975981
// Initialize event framework. This should be done after the node manager is
976982
// initialized.

src/ray/stats/stats.h

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -113,17 +113,10 @@ static inline void Init(
113113
StatsConfig::instance().SetIsInitialized(true);
114114
}
115115

116-
static inline void InitOpenTelemetryExporter(const int metrics_agent_port,
117-
const Status &metrics_agent_server_status) {
116+
static inline void InitOpenTelemetryExporter(const int metrics_agent_port) {
118117
if (!RayConfig::instance().enable_open_telemetry()) {
119118
return;
120119
}
121-
if (!metrics_agent_server_status.ok()) {
122-
RAY_LOG(ERROR) << "Failed to initialize OpenTelemetry exporter. Data will not be "
123-
"exported to the "
124-
<< "metrics agent. Server status: " << metrics_agent_server_status;
125-
return;
126-
}
127120
OpenTelemetryMetricRecorder::GetInstance().RegisterGrpcExporter(
128121
/*endpoint=*/std::string("127.0.0.1:") + std::to_string(metrics_agent_port),
129122
/*interval=*/

0 commit comments

Comments
 (0)