Skip to content

Commit 04198f7

Browse files
committed
[core][stats-die/02] kill STATS in object manager component
Signed-off-by: Cuong Nguyen <[email protected]>
1 parent e51f803 commit 04198f7

25 files changed

+345
-207
lines changed

src/ray/common/metrics.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,4 +48,29 @@ inline ray::stats::Gauge GetActorByStateGaugeMetric() {
4848
};
4949
}
5050

51+
inline ray::stats::Gauge GetObjectStoreMemoryGaugeMetric() {
52+
return ray::stats::Gauge{
53+
/*name=*/"object_store_memory",
54+
/*description=*/"Object store memory by various sub-kinds on this node",
55+
/*unit=*/"",
56+
/// Location:
57+
/// - MMAP_SHM: currently in shared memory(e.g. /dev/shm).
58+
/// - MMAP_DISK: memory that's fallback allocated on mmapped disk,
59+
/// e.g. /tmp.
60+
/// - WORKER_HEAP: ray objects smaller than ('max_direct_call_object_size',
61+
/// default 100KiB) stored in process memory, i.e. inlined return
62+
/// values, placeholders for objects stored in plasma store.
63+
/// - SPILLED: current number of bytes from objects spilled
64+
/// to external storage. Note this might be smaller than
65+
/// the physical storage incurred on the external storage because
66+
/// Ray might fuse spilled objects into a single file, so a deleted
67+
/// spill object might still exist in the spilled file. Check
68+
/// spilled object fusing for more details.
69+
/// ObjectState:
70+
/// - SEALED: sealed objects bytes (could be MMAP_SHM or MMAP_DISK)
71+
/// - UNSEALED: unsealed objects bytes (could be MMAP_SHM or MMAP_DISK)
72+
/*tag_keys=*/{"Location", "ObjectState"},
73+
};
74+
}
75+
5176
} // namespace ray

src/ray/core_worker/BUILD.bazel

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ ray_cc_library(
272272
":reference_counter_interface",
273273
"//src/ray/common:asio",
274274
"//src/ray/common:id",
275+
"//src/ray/common:metrics",
275276
"//src/ray/common:ray_config",
276277
"//src/ray/common:status",
277278
"//src/ray/raylet_ipc_client:raylet_ipc_client_interface",
@@ -423,6 +424,6 @@ ray_cc_library(
423424
name = "metrics",
424425
hdrs = ["metrics.h"],
425426
deps = [
426-
"//src/ray/stats:stats_lib",
427+
"//src/ray/stats:stats_metric",
427428
],
428429
)

src/ray/core_worker/store_provider/memory_store/memory_store.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -597,8 +597,8 @@ MemoryStoreStats CoreWorkerMemoryStore::GetMemoryStoreStatisticalData() {
597597

598598
void CoreWorkerMemoryStore::RecordMetrics() {
599599
absl::MutexLock lock(&mu_);
600-
stats::STATS_object_store_memory.Record(num_local_objects_bytes_,
601-
{{stats::LocationKey, "WORKER_HEAP"}});
600+
object_store_memory_gauge_.Record(num_local_objects_bytes_,
601+
{{stats::LocationKey, "WORKER_HEAP"}});
602602
}
603603

604604
} // namespace core

src/ray/core_worker/store_provider/memory_store/memory_store.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "absl/synchronization/mutex.h"
2525
#include "ray/common/asio/asio_util.h"
2626
#include "ray/common/id.h"
27+
#include "ray/common/metrics.h"
2728
#include "ray/common/status.h"
2829
#include "ray/core_worker/context.h"
2930
#include "ray/core_worker/reference_counter_interface.h"
@@ -253,6 +254,8 @@ class CoreWorkerMemoryStore {
253254
std::function<std::shared_ptr<RayObject>(const RayObject &object,
254255
const ObjectID &object_id)>
255256
object_allocator_;
257+
258+
ray::stats::Gauge object_store_memory_gauge_{ray::GetObjectStoreMemoryGaugeMetric()};
256259
};
257260

258261
} // namespace core

src/ray/object_manager/BUILD.bazel

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ ray_cc_library(
66
hdrs = ["object_manager.h"],
77
deps = [
88
":chunk_object_reader",
9+
":metrics",
910
":object_buffer_pool",
1011
":object_directory",
1112
":object_manager_common",
@@ -30,6 +31,7 @@ ray_cc_library(
3031
srcs = ["push_manager.cc"],
3132
hdrs = ["push_manager.h"],
3233
deps = [
34+
":metrics",
3335
"//src/ray/common:id",
3436
"//src/ray/stats:stats_metric",
3537
"@com_google_absl//absl/container:flat_hash_map",
@@ -41,6 +43,7 @@ ray_cc_library(
4143
srcs = ["pull_manager.cc"],
4244
hdrs = ["pull_manager.h"],
4345
deps = [
46+
":metrics",
4447
":object_manager_common",
4548
":ownership_object_directory",
4649
"//src/ray/common:id",
@@ -162,3 +165,12 @@ ray_cc_library(
162165
hdrs = ["object_reader.h"],
163166
deps = ["//src/ray/protobuf:common_cc_proto"],
164167
)
168+
169+
ray_cc_library(
170+
name = "metrics",
171+
hdrs = ["metrics.h"],
172+
deps = [
173+
"//src/ray/stats:stats_metric",
174+
"//src/ray/util:size_literals",
175+
],
176+
)

src/ray/object_manager/metrics.h

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
// Copyright 2025 The Ray Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
17+
#include "ray/stats/metric.h"
18+
#include "ray/util/size_literals.h"
19+
20+
namespace ray {
21+
22+
using ray::literals::operator""_MiB;
23+
24+
inline ray::stats::Histogram GetObjectStoreDistHistogramMetric() {
25+
return ray::stats::Histogram{
26+
/*name=*/"object_store_dist",
27+
/*description=*/"The distribution of object size in bytes",
28+
/*unit=*/"MiB",
29+
/*boundaries=*/
30+
{32_MiB,
31+
64_MiB,
32+
128_MiB,
33+
256_MiB,
34+
512_MiB,
35+
1024_MiB,
36+
2048_MiB,
37+
4096_MiB,
38+
8192_MiB,
39+
16384_MiB},
40+
/*tag_keys=*/{"Source"},
41+
};
42+
}
43+
44+
inline ray::stats::Gauge GetObjectStoreAvailableMemoryGaugeMetric() {
45+
return ray::stats::Gauge(
46+
/*name=*/"object_store_available_memory",
47+
/*description=*/"Amount of memory currently available in the object store.",
48+
/*unit=*/"bytes");
49+
}
50+
51+
inline ray::stats::Gauge GetObjectStoreUsedMemoryGaugeMetric() {
52+
return ray::stats::Gauge(
53+
/*name=*/"object_store_used_memory",
54+
/*description=*/"Amount of memory currently occupied in the object store.",
55+
/*unit=*/"bytes");
56+
}
57+
58+
inline ray::stats::Gauge GetObjectStoreFallbackMemoryGaugeMetric() {
59+
return ray::stats::Gauge(
60+
/*name=*/"object_store_fallback_memory",
61+
/*description=*/"Amount of memory in fallback allocations in the filesystem.",
62+
/*unit=*/"bytes");
63+
}
64+
65+
inline ray::stats::Gauge GetObjectStoreLocalObjectsGaugeMetric() {
66+
return ray::stats::Gauge(
67+
/*name=*/"object_store_num_local_objects",
68+
/*description=*/"Number of objects currently in the object store.",
69+
/*unit=*/"objects");
70+
}
71+
72+
inline ray::stats::Gauge GetObjectManagerPullRequestsGaugeMetric() {
73+
return ray::stats::Gauge(
74+
/*name=*/"object_manager_num_pull_requests",
75+
/*description=*/"Number of active pull requests for objects.",
76+
/*unit=*/"requests");
77+
}
78+
79+
inline ray::stats::Gauge GetObjectManagerBytesGaugeMetric() {
80+
return ray::stats::Gauge(
81+
/*name=*/"object_manager_bytes",
82+
/*description=*/
83+
"Number of bytes pushed or received by type {PushedFromLocalPlasma, "
84+
"PushedFromLocalDisk, Received}.",
85+
/*unit=*/"bytes",
86+
/*tag_keys=*/{"Type"});
87+
}
88+
89+
inline ray::stats::Gauge GetObjectManagerReceivedChunksGaugeMetric() {
90+
return ray::stats::Gauge(
91+
/*name=*/"object_manager_received_chunks",
92+
/*description=*/
93+
"Number object chunks received broken per type {Total, FailedTotal, "
94+
"FailedCancelled, FailedPlasmaFull}.",
95+
/*unit=*/"chunks",
96+
/*tag_keys=*/{"Type"});
97+
}
98+
99+
inline ray::stats::Gauge GetPullManagerUsageBytesGaugeMetric() {
100+
return ray::stats::Gauge(
101+
/*name=*/"pull_manager_usage_bytes",
102+
/*description=*/
103+
"The total number of bytes usage broken per type {Available, BeingPulled, Pinned}",
104+
/*unit=*/"bytes",
105+
/*tag_keys=*/{"Type"});
106+
}
107+
108+
inline ray::stats::Gauge GetPullManagerRequestedBundlesGaugeMetric() {
109+
return ray::stats::Gauge(
110+
/*name=*/"pull_manager_requested_bundles",
111+
/*description=*/
112+
"Number of requested bundles broken per type {Get, Wait, TaskArgs}.",
113+
/*unit=*/"bundles",
114+
/*tag_keys=*/{"Type"});
115+
}
116+
117+
inline ray::stats::Gauge GetPullManagerRequestsGaugeMetric() {
118+
return ray::stats::Gauge(
119+
/*name=*/"pull_manager_requests",
120+
/*description=*/"Number of pull requests broken per type {Queued, Active, Pinned}.",
121+
/*unit=*/"requests",
122+
/*tag_keys=*/{"Type"});
123+
}
124+
125+
inline ray::stats::Gauge GetPullManagerActiveBundlesGaugeMetric() {
126+
return ray::stats::Gauge(
127+
/*name=*/"pull_manager_active_bundles",
128+
/*description=*/"Number of active bundle requests",
129+
/*unit=*/"bundles",
130+
/*tag_keys=*/{"Type"});
131+
}
132+
133+
inline ray::stats::Gauge GetPullManagerRetriesTotalGaugeMetric() {
134+
return ray::stats::Gauge(
135+
/*name=*/"pull_manager_retries_total",
136+
/*description=*/"Number of cumulative pull retries.",
137+
/*unit=*/"retries",
138+
/*tag_keys=*/{"Type"});
139+
}
140+
141+
inline ray::stats::Gauge GetPullManagerNumObjectPinsGaugeMetric() {
142+
return ray::stats::Gauge(
143+
/*name=*/"pull_manager_num_object_pins",
144+
/*description=*/
145+
"Number of object pin attempts by the pull manager, can be {Success, Failure}.",
146+
/*unit=*/"pins",
147+
/*tag_keys=*/{"Type"});
148+
}
149+
150+
inline ray::stats::Histogram GetPullManagerObjectRequestTimeMsHistogramMetric() {
151+
return ray::stats::Histogram(
152+
/*name=*/"pull_manager_object_request_time_ms",
153+
/*description=*/
154+
"Time between initial object pull request and local pinning of the object. ",
155+
/*unit=*/"ms",
156+
/*boundaries=*/{1, 10, 100, 1000, 10000},
157+
/*tag_keys=*/{"Type"});
158+
}
159+
160+
inline ray::stats::Gauge GetPushManagerNumPushesRemainingGaugeMetric() {
161+
return ray::stats::Gauge(
162+
/*name=*/"push_manager_num_pushes_remaining",
163+
/*description=*/"Number of pushes not completed.",
164+
/*unit=*/"pushes",
165+
/*tag_keys=*/{"Type"});
166+
}
167+
168+
inline ray::stats::Gauge GetPushManagerChunksGaugeMetric() {
169+
return ray::stats::Gauge(
170+
/*name=*/"push_manager_chunks",
171+
/*description=*/
172+
"Number of object chunks transfer broken per type {InFlight, Remaining}.",
173+
/*unit=*/"chunks",
174+
/*tag_keys=*/{"Type"});
175+
}
176+
177+
} // namespace ray

src/ray/object_manager/object_manager.cc

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
#include "ray/common/asio/asio_util.h"
2727
#include "ray/object_manager/plasma/store_runner.h"
2828
#include "ray/object_manager/spilled_object_reader.h"
29-
#include "ray/stats/metric_defs.h"
3029
#include "ray/util/exponential_backoff.h"
3130

3231
namespace ray {
@@ -770,32 +769,31 @@ void ObjectManager::RecordMetrics() {
770769
push_manager_->RecordMetrics();
771770
// used_memory_ includes the fallback allocation, so we should add it again here
772771
// to calculate the exact available memory.
773-
ray_metric_object_store_available_memory_.Record(
772+
object_store_available_memory_gauge_.Record(
774773
config_.object_store_memory - used_memory_ +
775774
plasma::plasma_store_runner->GetFallbackAllocated());
776775
// Subtract fallback allocated memory. It is tracked separately by
777776
// `ObjectStoreFallbackMemory`.
778-
ray_metric_object_store_used_memory_.Record(
777+
object_store_used_memory_gauge_.Record(
779778
used_memory_ - plasma::plasma_store_runner->GetFallbackAllocated());
780-
ray_metric_object_store_fallback_memory_.Record(
779+
object_store_fallback_memory_gauge_.Record(
781780
plasma::plasma_store_runner->GetFallbackAllocated());
782-
ray_metric_object_store_local_objects_.Record(local_objects_.size());
783-
ray_metric_object_manager_pull_requests_.Record(pull_manager_->NumObjectPullRequests());
784-
785-
ray::stats::STATS_object_manager_bytes.Record(num_bytes_pushed_from_plasma_,
786-
"PushedFromLocalPlasma");
787-
ray::stats::STATS_object_manager_bytes.Record(num_bytes_pushed_from_disk_,
788-
"PushedFromLocalDisk");
789-
ray::stats::STATS_object_manager_bytes.Record(num_bytes_received_total_, "Received");
790-
791-
ray::stats::STATS_object_manager_received_chunks.Record(num_chunks_received_total_,
792-
"Total");
793-
ray::stats::STATS_object_manager_received_chunks.Record(
794-
num_chunks_received_total_failed_, "FailedTotal");
795-
ray::stats::STATS_object_manager_received_chunks.Record(num_chunks_received_cancelled_,
796-
"FailedCancelled");
797-
ray::stats::STATS_object_manager_received_chunks.Record(
798-
num_chunks_received_failed_due_to_plasma_, "FailedPlasmaFull");
781+
object_store_local_objects_gauge_.Record(local_objects_.size());
782+
object_manager_pull_requests_gauge_.Record(pull_manager_->NumObjectPullRequests());
783+
784+
object_manager_bytes_gauge_.Record(num_bytes_pushed_from_plasma_,
785+
{{"Type", "PushedFromLocalPlasma"}});
786+
object_manager_bytes_gauge_.Record(num_bytes_pushed_from_disk_,
787+
{{"Type", "PushedFromLocalDisk"}});
788+
object_manager_bytes_gauge_.Record(num_bytes_received_total_, {{"Type", "Received"}});
789+
object_manager_received_chunks_gauge_.Record(num_chunks_received_total_,
790+
{{"Type", "Total"}});
791+
object_manager_received_chunks_gauge_.Record(num_chunks_received_total_failed_,
792+
{{"Type", "FailedTotal"}});
793+
object_manager_received_chunks_gauge_.Record(num_chunks_received_cancelled_,
794+
{{"Type", "FailedCancelled"}});
795+
object_manager_received_chunks_gauge_.Record(num_chunks_received_failed_due_to_plasma_,
796+
{{"Type", "FailedPlasmaFull"}});
799797
}
800798

801799
void ObjectManager::FillObjectStoreStats(rpc::GetNodeStatsReply *reply) const {

src/ray/object_manager/object_manager.h

Lines changed: 13 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -521,31 +521,19 @@ class ObjectManager : public ObjectManagerInterface,
521521
/// plasma.
522522
size_t num_chunks_received_failed_due_to_plasma_ = 0;
523523

524-
/// Metrics
525-
ray::stats::Gauge ray_metric_object_store_available_memory_{
526-
/*name=*/"object_store_available_memory",
527-
/*description=*/"Amount of memory currently available in the object store.",
528-
/*unit=*/"bytes"};
529-
530-
ray::stats::Gauge ray_metric_object_store_used_memory_{
531-
/*name=*/"object_store_used_memory",
532-
/*description=*/"Amount of memory currently occupied in the object store.",
533-
/*unit=*/"bytes"};
534-
535-
ray::stats::Gauge ray_metric_object_store_fallback_memory_{
536-
/*name=*/"object_store_fallback_memory",
537-
/*description=*/"Amount of memory in fallback allocations in the filesystem.",
538-
/*unit=*/"bytes"};
539-
540-
ray::stats::Gauge ray_metric_object_store_local_objects_{
541-
/*name=*/"object_store_num_local_objects",
542-
/*description=*/"Number of objects currently in the object store.",
543-
/*unit=*/"objects"};
544-
545-
ray::stats::Gauge ray_metric_object_manager_pull_requests_{
546-
/*name=*/"object_manager_num_pull_requests",
547-
/*description=*/"Number of active pull requests for objects.",
548-
/*unit=*/"requests"};
524+
ray::stats::Gauge object_store_available_memory_gauge_{
525+
GetObjectStoreAvailableMemoryGaugeMetric()};
526+
ray::stats::Gauge object_store_used_memory_gauge_{
527+
ray::GetObjectStoreUsedMemoryGaugeMetric()};
528+
ray::stats::Gauge object_store_fallback_memory_gauge_{
529+
ray::GetObjectStoreFallbackMemoryGaugeMetric()};
530+
ray::stats::Gauge object_store_local_objects_gauge_{
531+
ray::GetObjectStoreLocalObjectsGaugeMetric()};
532+
ray::stats::Gauge object_manager_pull_requests_gauge_{
533+
ray::GetObjectManagerPullRequestsGaugeMetric()};
534+
ray::stats::Gauge object_manager_bytes_gauge_{ray::GetObjectManagerBytesGaugeMetric()};
535+
ray::stats::Gauge object_manager_received_chunks_gauge_{
536+
ray::GetObjectManagerReceivedChunksGaugeMetric()};
549537
};
550538

551539
} // namespace ray

0 commit comments

Comments
 (0)