-
Notifications
You must be signed in to change notification settings - Fork 7k
[core][stats-die/03] kill STATS in core worker component #58060
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -185,8 +185,10 @@ class CoreWorkerProcessImpl { | |
| /// The client to export metrics to the metrics agent. | ||
| std::unique_ptr<ray::rpc::MetricsAgentClient> metrics_agent_client_; | ||
|
|
||
| ray::stats::Gauge task_by_state_gauge_{GetTaskByStateGaugeMetric()}; | ||
| ray::stats::Gauge actor_by_state_gauge_{GetActorByStateGaugeMetric()}; | ||
| std::unique_ptr<ray::stats::Gauge> task_by_state_gauge_; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. note: I changed this to pointer to the metric initialization can happen after |
||
| std::unique_ptr<ray::stats::Gauge> actor_by_state_gauge_; | ||
| std::unique_ptr<ray::stats::Gauge> total_lineage_bytes_gauge_; | ||
| std::unique_ptr<ray::stats::Histogram> scheduler_placement_time_s_histogram_; | ||
| }; | ||
| } // namespace core | ||
| } // namespace ray | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,7 @@ | |
| #include "ray/core_worker/store_provider/memory_store/memory_store.h" | ||
| #include "ray/core_worker_rpc_client/core_worker_client_pool.h" | ||
| #include "ray/core_worker_rpc_client/fake_core_worker_client.h" | ||
| #include "ray/observability/fake_metric.h" | ||
| #include "ray/raylet_rpc_client/fake_raylet_client.h" | ||
| #include "ray/raylet_rpc_client/raylet_client_interface.h" | ||
|
|
||
|
|
@@ -494,7 +495,8 @@ class NormalTaskSubmitterTest : public testing::Test { | |
| JobID::Nil(), | ||
| rate_limiter, | ||
| [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; }, | ||
| boost::asio::steady_timer(io_context)); | ||
| boost::asio::steady_timer(io_context), | ||
| fake_scheduler_placement_time_s_histogram_); | ||
| } | ||
|
|
||
| NodeID local_node_id; | ||
|
|
@@ -511,6 +513,7 @@ class NormalTaskSubmitterTest : public testing::Test { | |
| std::unique_ptr<MockLeasePolicy> lease_policy; | ||
| MockLeasePolicy *lease_policy_ptr = nullptr; | ||
| instrumented_io_context io_context; | ||
| ray::observability::FakeHistogram fake_scheduler_placement_time_s_histogram_; | ||
| }; | ||
|
|
||
| TEST_F(NormalTaskSubmitterTest, TestLocalityAwareSubmitOneTask) { | ||
|
|
@@ -1430,6 +1433,7 @@ void TestSchedulingKey(const std::shared_ptr<CoreWorkerMemoryStore> store, | |
| const TaskSpecification &same2, | ||
| const TaskSpecification &different) { | ||
| rpc::Address address; | ||
| ray::observability::FakeHistogram fake_scheduler_placement_time_s_histogram_; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Local Variable Passed to Constructor Causes Use-After-FreeVariable
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's fine, the |
||
| auto local_node_id = NodeID::FromRandom(); | ||
| auto raylet_client = std::make_shared<MockRayletClient>(); | ||
| auto raylet_client_pool = std::make_shared<rpc::RayletClientPool>( | ||
|
|
@@ -1457,7 +1461,8 @@ void TestSchedulingKey(const std::shared_ptr<CoreWorkerMemoryStore> store, | |
| JobID::Nil(), | ||
| std::make_shared<StaticLeaseRequestRateLimiter>(1), | ||
| [](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; }, | ||
| boost::asio::steady_timer(io_context)); | ||
| boost::asio::steady_timer(io_context), | ||
| fake_scheduler_placement_time_s_histogram_); | ||
|
|
||
| submitter.SubmitTask(same1); | ||
| submitter.SubmitTask(same2); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So I've lately gotten a bit of a crash course in this thing. These.... buckets seem a little off right? scheduler_placement_time_s is in seconds (I'm assuming because of the _s). So we're saying the scheduler placement time falls within 0.1s->1s->10s->100s->16 minutes -> 2.5+ hours
The latter buckets seem absurd right? What is the actual realistic spread of latency for the scheduler to place something? If the _s is misleading and actually it's not in seconds then we should fix that. But we should choose our buckets along the lines of
-->healthy range
-->elevated range
-->error range
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
make sense, the series of number must’ve been pulled out of a hat at some point ;) - i'll fix them on a PR on top of this, to keep this PR purely a refactoring
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah look like the value is only meaningful in the range of less than 10s; most are actually below 0.1s so further breakdown of bucket less than 0.1s is probably more useful
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if it even makes sense to have this metric in seconds at all. It seems like we'd want this to usually be sub second
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
make total sense
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there you go #58217