Skip to content

Commit 43c2382

Browse files
committed
[core][stats-die/04] kill STATS in the common component
Signed-off-by: Cuong Nguyen <[email protected]>
1 parent f79eebd commit 43c2382

File tree

17 files changed

+139
-97
lines changed

17 files changed

+139
-97
lines changed

src/ray/common/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ ray_cc_library(
282282
],
283283
deps = [
284284
":event_stats",
285+
":metrics",
285286
":ray_config",
286287
"//src/ray/util:array",
287288
"@boost//:asio",
@@ -311,6 +312,7 @@ ray_cc_library(
311312
"event_stats.h",
312313
],
313314
deps = [
315+
":metrics",
314316
":ray_config",
315317
"//src/ray/stats:stats_metric",
316318
"//src/ray/util:time",

src/ray/common/asio/instrumented_io_context.cc

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
#include "ray/common/asio/asio_chaos.h"
2121
#include "ray/common/asio/asio_util.h"
2222
#include "ray/stats/metric.h"
23-
#include "ray/stats/metric_defs.h"
2423

2524
namespace {
2625

@@ -35,7 +34,7 @@ void LagProbeLoop(instrumented_io_context &io_context,
3534
auto end = std::chrono::steady_clock::now();
3635
auto duration =
3736
std::chrono::duration_cast<std::chrono::milliseconds>(end - begin);
38-
ray::stats::STATS_io_context_event_loop_lag_ms.Record(
37+
io_context.io_context_event_loop_lag_ms_gauge_metric.Record(
3938
duration.count(),
4039
{
4140
{"Name", context_name.value_or(GetThreadName())},
@@ -106,8 +105,9 @@ void instrumented_io_context::post(std::function<void()> handler,
106105
auto stats_handle =
107106
event_stats_->RecordStart(std::move(name), emit_metrics_, 0, context_name_);
108107
handler = [handler = std::move(handler),
108+
event_stats = event_stats_,
109109
stats_handle = std::move(stats_handle)]() mutable {
110-
EventTracker::RecordExecution(handler, std::move(stats_handle));
110+
event_stats->RecordExecution(handler, std::move(stats_handle));
111111
};
112112
}
113113

@@ -129,9 +129,10 @@ void instrumented_io_context::dispatch(std::function<void()> handler, std::strin
129129
// GuardedHandlerStats synchronizes internal access, we can concurrently write to the
130130
// handler stats it->second from multiple threads without acquiring a table-level
131131
// readers lock in the callback.
132-
boost::asio::dispatch(
133-
*this,
134-
[handler = std::move(handler), stats_handle = std::move(stats_handle)]() mutable {
135-
EventTracker::RecordExecution(handler, std::move(stats_handle));
136-
});
132+
boost::asio::dispatch(*this,
133+
[event_stats = event_stats_,
134+
handler = std::move(handler),
135+
stats_handle = std::move(stats_handle)]() mutable {
136+
event_stats->RecordExecution(handler, std::move(stats_handle));
137+
});
137138
}

src/ray/common/asio/instrumented_io_context.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <string>
2020

2121
#include "ray/common/event_stats.h"
22+
#include "ray/common/metrics.h"
2223
#include "ray/common/ray_config.h"
2324
#include "ray/util/logging.h"
2425

@@ -56,7 +57,10 @@ class instrumented_io_context : public boost::asio::io_context {
5657
/// for the provided handler.
5758
void dispatch(std::function<void()> handler, std::string name);
5859

59-
EventTracker &stats() const { return *event_stats_; };
60+
std::shared_ptr<EventTracker> stats() const { return event_stats_; };
61+
62+
ray::stats::Gauge io_context_event_loop_lag_ms_gauge_metric{
63+
ray::GetIoContextEventLoopLagMsGaugeMetric()};
6064

6165
private:
6266
/// The event stats tracker to use to record asio handler stats to.

src/ray/common/asio/periodical_runner.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ void PeriodicalRunner::DoRunFnPeriodicallyInstrumented(
106106
// which the handler was elgible to execute on the event loop but was queued by the
107107
// event loop.
108108
auto stats_handle =
109-
io_service_.stats().RecordStart(name, false, period.total_nanoseconds());
109+
io_service_.stats()->RecordStart(name, false, period.total_nanoseconds());
110110
timer->async_wait(
111111
[weak_self = weak_from_this(),
112112
fn = std::move(fn),
@@ -115,7 +115,7 @@ void PeriodicalRunner::DoRunFnPeriodicallyInstrumented(
115115
stats_handle = std::move(stats_handle),
116116
name = std::move(name)](const boost::system::error_code &error) mutable {
117117
if (auto self = weak_self.lock(); self) {
118-
self->io_service_.stats().RecordExecution(
118+
self->io_service_.stats()->RecordExecution(
119119
[self,
120120
fn = std::move(fn),
121121
error,

src/ray/common/event_stats.cc

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
#include <utility>
2222

2323
#include "ray/stats/metric.h"
24-
#include "ray/stats/metric_defs.h"
2524
#include "ray/util/time.h"
2625

2726
namespace {
@@ -66,9 +65,9 @@ std::shared_ptr<StatsHandle> EventTracker::RecordStart(
6665
}
6766

6867
if (emit_metrics) {
69-
ray::stats::STATS_operation_count.Record(1, event_context_name.value_or(name));
70-
ray::stats::STATS_operation_active_count.Record(curr_count,
71-
event_context_name.value_or(name));
68+
operation_count_metric_.Record(1, {{"Name", event_context_name.value_or(name)}});
69+
operation_active_gauge_metric_.Record(curr_count,
70+
{{"Name", event_context_name.value_or(name)}});
7271
}
7372

7473
return std::make_shared<StatsHandle>(
@@ -89,10 +88,11 @@ void EventTracker::RecordEnd(std::shared_ptr<StatsHandle> handle) {
8988

9089
if (handle->emit_stats) {
9190
// Update event-specific stats.
92-
ray::stats::STATS_operation_run_time_ms.Record(
93-
execution_time_ns / 1000000, handle->context_name.value_or(handle->event_name));
94-
ray::stats::STATS_operation_active_count.Record(
95-
curr_count, handle->context_name.value_or(handle->event_name));
91+
operation_run_time_ms_histogram_metric_.Record(
92+
execution_time_ns / 1000000,
93+
{{"Name", handle->context_name.value_or(handle->event_name)}});
94+
operation_active_gauge_metric_.Record(
95+
curr_count, {{"Name", handle->context_name.value_or(handle->event_name)}});
9696
}
9797

9898
handle->end_or_execution_recorded = true;
@@ -135,13 +135,15 @@ void EventTracker::RecordExecution(const std::function<void()> &fn,
135135

136136
if (handle->emit_stats) {
137137
// Update event-specific stats.
138-
ray::stats::STATS_operation_run_time_ms.Record(
139-
execution_time_ns / 1000000, handle->context_name.value_or(handle->event_name));
140-
ray::stats::STATS_operation_active_count.Record(
141-
curr_count, handle->context_name.value_or(handle->event_name));
138+
operation_run_time_ms_histogram_metric_.Record(
139+
execution_time_ns / 1000000,
140+
{{"Name", handle->context_name.value_or(handle->event_name)}});
141+
operation_active_gauge_metric_.Record(
142+
curr_count, {{"Name", handle->context_name.value_or(handle->event_name)}});
142143
// Update global stats.
143-
ray::stats::STATS_operation_queue_time_ms.Record(
144-
queue_time_ns / 1000000, handle->context_name.value_or(handle->event_name));
144+
operation_queue_time_ms_histogram_metric_.Record(
145+
queue_time_ns / 1000000,
146+
{{"Name", handle->context_name.value_or(handle->event_name)}});
145147
}
146148

147149
{

src/ray/common/event_stats.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include "absl/container/flat_hash_map.h"
2020
#include "absl/synchronization/mutex.h"
21+
#include "ray/common/metrics.h"
2122
#include "ray/common/ray_config.h"
2223
#include "ray/util/logging.h"
2324

@@ -132,14 +133,14 @@ class EventTracker {
132133
///
133134
/// \param fn The function to execute and instrument.
134135
/// \param handle An opaque stats handle returned by RecordStart().
135-
static void RecordExecution(const std::function<void()> &fn,
136-
std::shared_ptr<StatsHandle> handle);
136+
void RecordExecution(const std::function<void()> &fn,
137+
std::shared_ptr<StatsHandle> handle);
137138

138139
/// Records the end of an event. This is used in conjunction
139140
/// with RecordStart() to manually instrument an event.
140141
///
141142
/// \param handle An opaque stats handle returned by RecordStart().
142-
static void RecordEnd(std::shared_ptr<StatsHandle> handle);
143+
void RecordEnd(std::shared_ptr<StatsHandle> handle);
143144

144145
/// Returns a snapshot view of the global count, queueing, and execution statistics
145146
/// across all handlers.
@@ -188,4 +189,12 @@ class EventTracker {
188189

189190
/// Protects access to the per-handler post stats table.
190191
mutable absl::Mutex mutex_;
192+
193+
ray::stats::Count operation_count_metric_{ray::GetOperationCountCounterMetric()};
194+
ray::stats::Gauge operation_active_gauge_metric_{
195+
ray::GetOperationActiveCountGaugeMetric()};
196+
ray::stats::Histogram operation_run_time_ms_histogram_metric_{
197+
ray::GetOperationRunTimeMsHistogramMetric()};
198+
ray::stats::Histogram operation_queue_time_ms_histogram_metric_{
199+
ray::GetOperationQueueTimeMsHistogramMetric()};
191200
};

src/ray/common/metrics.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,51 @@ inline ray::stats::Histogram GetSchedulerPlacementTimeMsHistogramMetric() {
8686
};
8787
}
8888

89+
inline ray::stats::Gauge GetIoContextEventLoopLagMsGaugeMetric() {
90+
return ray::stats::Gauge{
91+
/*name=*/"io_context_event_loop_lag_ms",
92+
/*description=*/"The latency of a task from post to execution",
93+
/*unit=*/"ms",
94+
/*tag_keys=*/{"Name"},
95+
};
96+
}
97+
98+
inline ray::stats::Count GetOperationCountCounterMetric() {
99+
return ray::stats::Count{
100+
/*name=*/"operation_count",
101+
/*description=*/"operation count",
102+
/*unit=*/"",
103+
/*tag_keys=*/{"Name"},
104+
};
105+
}
106+
107+
inline ray::stats::Histogram GetOperationRunTimeMsHistogramMetric() {
108+
return ray::stats::Histogram{
109+
/*name=*/"operation_run_time_ms",
110+
/*description=*/"operation execution time",
111+
/*unit=*/"ms",
112+
/*boundaries=*/{1, 10, 100, 1000, 10000},
113+
/*tag_keys=*/{"Name"},
114+
};
115+
}
116+
117+
inline ray::stats::Histogram GetOperationQueueTimeMsHistogramMetric() {
118+
return ray::stats::Histogram{
119+
/*name=*/"operation_queue_time_ms",
120+
/*description=*/"operation queuing time",
121+
/*unit=*/"ms",
122+
/*boundaries=*/{1, 10, 100, 1000, 10000},
123+
/*tag_keys=*/{"Name"},
124+
};
125+
}
126+
127+
inline ray::stats::Gauge GetOperationActiveCountGaugeMetric() {
128+
return ray::stats::Gauge{
129+
/*name=*/"operation_active_count",
130+
/*description=*/"active operation number",
131+
/*unit=*/"",
132+
/*tag_keys=*/{"Name"},
133+
};
134+
}
135+
89136
} // namespace ray

src/ray/core_worker/core_worker.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -453,10 +453,10 @@ CoreWorker::CoreWorker(
453453
periodical_runner_->RunFnPeriodically(
454454
[this] {
455455
RAY_LOG(INFO) << "Event stats:\n\n"
456-
<< io_service_.stats().StatsString() << "\n\n"
456+
<< io_service_.stats()->StatsString() << "\n\n"
457457
<< "-----------------\n"
458458
<< "Task execution event stats:\n"
459-
<< task_execution_service_.stats().StatsString() << "\n\n"
459+
<< task_execution_service_.stats()->StatsString() << "\n\n"
460460
<< "-----------------\n"
461461
<< "Task Event stats:\n"
462462
<< task_event_buffer_->DebugString() << "\n";

src/ray/core_worker/task_event_buffer.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1012,7 +1012,7 @@ std::string TaskEventBufferImpl::DebugString() {
10121012

10131013
auto stats = stats_counter_.GetAll();
10141014
ss << "\nIO Service Stats:\n";
1015-
ss << io_service_.stats().StatsString();
1015+
ss << io_service_.stats()->StatsString();
10161016
ss << "\nOther Stats:"
10171017
<< "\n\tgcs_grpc_in_progress:" << gcs_grpc_in_progress_
10181018
<< "\n\tevent_aggregator_grpc_in_progress:" << event_aggregator_grpc_in_progress_

src/ray/gcs/gcs_server.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -952,11 +952,11 @@ void GcsServer::PrintDebugState() const {
952952
RayConfig::instance().event_stats_print_interval_ms();
953953
if (event_stats_print_interval_ms != -1 && RayConfig::instance().event_stats()) {
954954
RAY_LOG(INFO) << "Main service Event stats:\n\n"
955-
<< io_context_provider_.GetDefaultIOContext().stats().StatsString()
955+
<< io_context_provider_.GetDefaultIOContext().stats()->StatsString()
956956
<< "\n\n";
957957
for (const auto &io_context : io_context_provider_.GetAllDedicatedIOContexts()) {
958958
RAY_LOG(INFO) << io_context->GetName() << " Event stats:\n\n"
959-
<< io_context->GetIoService().stats().StatsString() << "\n\n";
959+
<< io_context->GetIoService().stats()->StatsString() << "\n\n";
960960
}
961961
}
962962
}

0 commit comments

Comments
 (0)