-
Notifications
You must be signed in to change notification settings - Fork 7k
[core] Fix "RayEventRecorder::StartExportingEvents() should be called only once." #57917
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,30 +41,31 @@ void MetricsAgentClientImpl::WaitForServerReadyWithRetry( | |
| // Only log the first time we start the retry loop. | ||
| RAY_LOG(INFO) << "Initializing exporter ..."; | ||
| } | ||
| HealthCheck(rpc::HealthCheckRequest(), | ||
| [this, init_exporter_fn](auto &status, auto &&reply) { | ||
| if (status.ok() && !exporter_initialized_) { | ||
| init_exporter_fn(status); | ||
| exporter_initialized_ = true; | ||
| RAY_LOG(INFO) << "Exporter initialized."; | ||
| } | ||
| }); | ||
| if (retry_count >= max_retry) { | ||
| init_exporter_fn(Status::RpcError("The metrics agent server is not ready.", 14)); | ||
| return; | ||
| } | ||
| retry_count++; | ||
| retry_timer_->expires_after(std::chrono::milliseconds(retry_interval_ms)); | ||
| retry_timer_->async_wait( | ||
| [this, init_exporter_fn, retry_count, max_retry, retry_interval_ms]( | ||
| const boost::system::error_code &error) { | ||
| if (!error) { | ||
| WaitForServerReadyWithRetry( | ||
| init_exporter_fn, retry_count, max_retry, retry_interval_ms); | ||
| } else { | ||
| RAY_LOG(ERROR) << "Failed to initialize exporter. Data will not be exported to " | ||
| "the metrics agent."; | ||
| HealthCheck( | ||
| rpc::HealthCheckRequest(), | ||
| [this, init_exporter_fn, retry_count, max_retry, retry_interval_ms](auto &status, | ||
| auto &&reply) { | ||
| if (status.ok()) { | ||
| if (exporter_initialized_) { | ||
| return; | ||
| } | ||
| init_exporter_fn(status); | ||
| exporter_initialized_ = true; | ||
| RAY_LOG(INFO) << "Exporter initialized."; | ||
| return; | ||
|
Comment on lines
+48
to
+55
Contributor
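There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This block has a race condition on `exporter_initialized_`: the flag is checked and then set without synchronization, so two health-check callbacks running concurrently could both see it as `false` and both call `init_exporter_fn`. To solve this, you should protect this critical section. One way is to use a mutex. You would add a mutex member (`exporter_mutex_` in the snippet below) and hold it across the check and the update:
if (status.ok()) {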
absl::MutexLock lock(&exporter_mutex_);
if (exporter_initialized_) {
return;
}
init_exporter_fn(status);
exporter_initialized_ = true;
RAY_LOG(INFO) << "Exporter initialized.";
return;
}
An alternative is to make `exporter_initialized_` a `std::atomic<bool>`.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks, |
||
| } | ||
| if (retry_count >= max_retry) { | ||
| init_exporter_fn(Status::RpcError( | ||
| "Running out of retries to initialize the metrics agent.", 14)); | ||
| return; | ||
| } | ||
| io_service_.post( | ||
| [this, init_exporter_fn, retry_count, max_retry, retry_interval_ms]() { | ||
| WaitForServerReadyWithRetry( | ||
| init_exporter_fn, retry_count + 1, max_retry, retry_interval_ms); | ||
| }, | ||
| "MetricsAgentClient.WaitForServerReadyWithRetry", | ||
| retry_interval_ms * 1000); | ||
|
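There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bug: Refactored Retry Logic Causes Dangling Pointer. The refactored retry logic uses `io_service_.post` to schedule a lambda that captures `this`; if the client is destroyed before the posted task runs, the task dereferences a dangling pointer. Additional Locations (1)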
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. annoyingly this might be true
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Or maybe not: the io context is "stopped" before the metrics agent client is destructed (https://github.com/ray-project/ray/blob/master/src/ray/gcs/gcs_server.cc#L331). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bug: Incorrect Retry Timing in Metrics Exporter. The |
||
| }); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -72,12 +72,12 @@ class MetricsAgentClientImpl : public MetricsAgentClient { | |
| MetricsAgentClientImpl(const std::string &address, | ||
| const int port, | ||
| instrumented_io_context &io_service, | ||
| rpc::ClientCallManager &client_call_manager) { | ||
| rpc::ClientCallManager &client_call_manager) | ||
| : io_service_(io_service) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Async Callbacks Access Destroyed ObjectsThe |
||
| RAY_LOG(DEBUG) << "Initiate the metrics client of address:" | ||
| << BuildAddress(address, port); | ||
| grpc_client_ = | ||
| std::make_unique<GrpcClient<ReporterService>>(address, port, client_call_manager); | ||
| retry_timer_ = std::make_unique<boost::asio::steady_timer>(io_service); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: we don't perform the retry asynchronously anymore; instead, we retry from the callback of the connection health check. |
||
| }; | ||
|
|
||
| VOID_RPC_CLIENT_METHOD(ReporterService, | ||
|
|
@@ -89,7 +89,7 @@ class MetricsAgentClientImpl : public MetricsAgentClient { | |
| VOID_RPC_CLIENT_METHOD(ReporterService, | ||
| HealthCheck, | ||
| grpc_client_, | ||
| /*method_timeout_ms*/ -1, | ||
| /*method_timeout_ms*/ kMetricAgentInitRetryDelayMs, | ||
| override) | ||
|
|
||
| /// Wait for the server to be ready. Invokes the callback with the final readiness | ||
|
|
@@ -99,8 +99,8 @@ class MetricsAgentClientImpl : public MetricsAgentClient { | |
| private: | ||
| /// The RPC client. | ||
| std::unique_ptr<GrpcClient<ReporterService>> grpc_client_; | ||
| /// Timer for retrying to initialize the OpenTelemetry exporter. | ||
| std::unique_ptr<boost::asio::steady_timer> retry_timer_; | ||
| /// The io context to run the retry loop. | ||
| instrumented_io_context &io_service_; | ||
| /// Whether the exporter is initialized. | ||
| bool exporter_initialized_ = false; | ||
| /// Wait for the server to be ready with a retry count. Invokes the callback | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll refactor this call to use the same pattern in a follow-up PR (to keep this PR minimal).