From ca9802b9834d312097d548e04f2d8cc99961f9f0 Mon Sep 17 00:00:00 2001 From: Blaise Bruer Date: Wed, 24 Jul 2024 23:25:58 -0500 Subject: [PATCH] [Refactor] Complete metrics overhaul Metrics got an entire overhaul. Instead of relying on a broken prometheus library to publish our metrics, we now use the `tracing` library together with OpenTelemetry, bind the two together, and then publish the result into a prometheus library. Metrics are now mostly derive-macros. This means that the struct can express what it wants to export along with a help text. The library will choose whether it is able to export it. Tracing now works by calling `.publish()` on the parent structs; those structs need to call `.publish()` on all the child members they wish to publish data about. If a "group" is requested, use the `group!()` macro, which under the hood calls `tracing::span` with some special labels. At primitive layers, it will call the `publish!()` macro, which will call the `tracing::event!()` macro under the hood with some special fields set. A custom `tracing::Subscriber` will intercept all the events and spans and convert them into a JSON-like object. This object can then be exported as real JSON or encoded into other formats like otel/prometheus. 
closes: #1164, #650, #384, #209 towards: #206 --- BUILD.bazel | 9 +- Cargo.lock | 230 +++++- Cargo.toml | 9 +- integration_tests/simple_prometheus_test.sh | 17 +- nativelink-error/BUILD.bazel | 1 + nativelink-error/Cargo.toml | 1 + nativelink-error/src/lib.rs | 13 + nativelink-metric-collector/BUILD.bazel | 63 ++ nativelink-metric-collector/Cargo.toml | 22 + nativelink-metric-collector/src/lib.rs | 21 + .../src/metrics_collection.rs | 88 +++ .../src/metrics_visitors.rs | 161 +++++ .../src/otel_exporter.rs | 75 ++ .../src/tracing_layers.rs | 148 ++++ .../tests/metric_collector_test.rs | 194 ++++++ nativelink-metric/BUILD.bazel | 35 + nativelink-metric/Cargo.toml | 12 + .../BUILD.bazel | 31 + .../nativelink-metric-macro-derive/Cargo.toml | 14 + .../nativelink-metric-macro-derive/src/lib.rs | 238 +++++++ nativelink-metric/src/lib.rs | 524 ++++++++++++++ nativelink-scheduler/BUILD.bazel | 2 + nativelink-scheduler/Cargo.toml | 3 +- nativelink-scheduler/src/action_scheduler.rs | 7 +- .../src/api_worker_scheduler.rs | 95 +-- .../src/awaited_action_db/awaited_action.rs | 33 +- .../src/awaited_action_db/mod.rs | 6 +- .../src/cache_lookup_scheduler.rs | 6 + .../src/default_scheduler_factory.rs | 42 +- nativelink-scheduler/src/grpc_scheduler.rs | 5 + .../src/memory_awaited_action_db.rs | 61 +- .../src/platform_property_manager.rs | 22 + .../src/property_modifier_scheduler.rs | 13 +- nativelink-scheduler/src/simple_scheduler.rs | 14 +- .../src/simple_scheduler_state_manager.rs | 18 +- nativelink-scheduler/src/worker.rs | 116 +--- nativelink-scheduler/src/worker_scheduler.rs | 9 +- .../tests/utils/mock_scheduler.rs | 4 + nativelink-service/BUILD.bazel | 3 +- nativelink-service/Cargo.toml | 2 +- nativelink-service/tests/ac_server_test.rs | 3 - nativelink-service/tests/bep_server_test.rs | 2 - .../tests/bytestream_server_test.rs | 2 - nativelink-service/tests/cas_server_test.rs | 2 - .../tests/worker_api_server_test.rs | 2 + nativelink-store/BUILD.bazel | 3 +- 
nativelink-store/Cargo.toml | 2 +- .../src/completeness_checking_store.rs | 31 +- nativelink-store/src/compression_store.rs | 9 +- nativelink-store/src/dedup_store.rs | 5 + nativelink-store/src/default_store_factory.rs | 31 +- nativelink-store/src/existence_cache_store.rs | 15 +- nativelink-store/src/fast_slow_store.rs | 44 +- nativelink-store/src/filesystem_store.rs | 42 +- nativelink-store/src/grpc_store.rs | 3 + nativelink-store/src/memory_store.rs | 14 +- nativelink-store/src/noop_store.rs | 13 + nativelink-store/src/redis_store.rs | 15 +- nativelink-store/src/ref_store.rs | 3 + nativelink-store/src/s3_store.rs | 6 + nativelink-store/src/shard_store.rs | 40 +- .../src/size_partitioning_store.rs | 24 +- nativelink-store/src/store_manager.rs | 17 +- nativelink-store/src/verify_store.rs | 42 +- .../tests/fast_slow_store_test.rs | 8 +- nativelink-util/BUILD.bazel | 2 +- nativelink-util/Cargo.toml | 2 +- nativelink-util/src/action_messages.rs | 96 ++- nativelink-util/src/common.rs | 14 + nativelink-util/src/digest_hasher.rs | 13 + nativelink-util/src/evicting_map.rs | 129 +--- nativelink-util/src/metrics_utils.rs | 654 ++++-------------- .../src/operation_state_manager.rs | 17 +- nativelink-util/src/platform_properties.rs | 25 +- nativelink-util/src/store_trait.rs | 18 +- nativelink-worker/BUILD.bazel | 1 + nativelink-worker/Cargo.toml | 1 + nativelink-worker/src/local_worker.rs | 135 ++-- .../src/running_actions_manager.rs | 124 +--- src/bin/nativelink.rs | 272 +++++--- 80 files changed, 2723 insertions(+), 1525 deletions(-) create mode 100644 nativelink-metric-collector/BUILD.bazel create mode 100644 nativelink-metric-collector/Cargo.toml create mode 100644 nativelink-metric-collector/src/lib.rs create mode 100644 nativelink-metric-collector/src/metrics_collection.rs create mode 100644 nativelink-metric-collector/src/metrics_visitors.rs create mode 100644 nativelink-metric-collector/src/otel_exporter.rs create mode 100644 
nativelink-metric-collector/src/tracing_layers.rs create mode 100644 nativelink-metric-collector/tests/metric_collector_test.rs create mode 100644 nativelink-metric/BUILD.bazel create mode 100644 nativelink-metric/Cargo.toml create mode 100644 nativelink-metric/nativelink-metric-macro-derive/BUILD.bazel create mode 100644 nativelink-metric/nativelink-metric-macro-derive/Cargo.toml create mode 100644 nativelink-metric/nativelink-metric-macro-derive/src/lib.rs create mode 100644 nativelink-metric/src/lib.rs diff --git a/BUILD.bazel b/BUILD.bazel index 4338456b9..3ce902f18 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -16,6 +16,8 @@ rust_binary( deps = [ "//nativelink-config", "//nativelink-error", + "//nativelink-metric", + "//nativelink-metric-collector", "//nativelink-proto", "//nativelink-scheduler", "//nativelink-service", @@ -28,16 +30,21 @@ rust_binary( "@crates//:futures", "@crates//:hyper", "@crates//:mimalloc", + "@crates//:opentelemetry", + "@crates//:opentelemetry-prometheus", + "@crates//:opentelemetry_sdk", "@crates//:parking_lot", - "@crates//:prometheus-client", + "@crates//:prometheus", "@crates//:rustls-pemfile", "@crates//:scopeguard", + "@crates//:serde_json", "@crates//:serde_json5", "@crates//:tokio", "@crates//:tokio-rustls", "@crates//:tonic", "@crates//:tower", "@crates//:tracing", + "@crates//:tracing-subscriber", ], ) diff --git a/Cargo.lock b/Cargo.lock index bf90bf222..a6fe674ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -110,12 +110,6 @@ version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" -[[package]] -name = "arc-cell" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fec9da9adf9420d86def101bd5b4a227b0512d456b6a128b0d677fdf68e5f7b8" - [[package]] name = "arc-swap" version = "1.7.1" @@ -768,6 +762,12 @@ dependencies = [ "syn_derive", ] +[[package]] +name = "bumpalo" +version = 
"3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + [[package]] name = "byte-unit" version = "5.1.4" @@ -1113,12 +1113,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "dtoa" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" - [[package]] name = "ecdsa" version = "0.14.8" @@ -1393,6 +1387,12 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "group" version = "0.12.1" @@ -1688,6 +1688,15 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -1881,22 +1890,29 @@ dependencies = [ "mimalloc", "nativelink-config", "nativelink-error", + "nativelink-metric", + "nativelink-metric-collector", "nativelink-proto", "nativelink-scheduler", "nativelink-service", "nativelink-store", "nativelink-util", "nativelink-worker", + "opentelemetry", + "opentelemetry-prometheus", + "opentelemetry_sdk", "parking_lot", - "prometheus-client", + "prometheus", "rustls-pemfile 2.1.2", "scopeguard", + "serde_json", "serde_json5", "tokio", "tokio-rustls 0.25.0", "tonic", "tower", "tracing", + "tracing-subscriber", ] [[package]] @@ -1916,6 +1932,7 @@ 
name = "nativelink-error" version = "0.4.0" dependencies = [ "hex", + "nativelink-metric", "nativelink-proto", "prost", "prost-types", @@ -1934,6 +1951,43 @@ dependencies = [ "syn 2.0.68", ] +[[package]] +name = "nativelink-metric" +version = "0.4.0" +dependencies = [ + "async-lock", + "nativelink-metric-macro-derive", + "parking_lot", + "tokio", + "tracing", +] + +[[package]] +name = "nativelink-metric-collector" +version = "0.4.0" +dependencies = [ + "nativelink-error", + "nativelink-metric", + "opentelemetry", + "opentelemetry-prometheus", + "opentelemetry_sdk", + "parking_lot", + "prometheus", + "serde", + "serde_json", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "nativelink-metric-macro-derive" +version = "0.4.0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.68", +] + [[package]] name = "nativelink-proto" version = "0.4.0" @@ -1959,6 +2013,7 @@ dependencies = [ "nativelink-config", "nativelink-error", "nativelink-macro", + "nativelink-metric", "nativelink-proto", "nativelink-store", "nativelink-util", @@ -1992,13 +2047,13 @@ dependencies = [ "nativelink-config", "nativelink-error", "nativelink-macro", + "nativelink-metric", "nativelink-proto", "nativelink-scheduler", "nativelink-store", "nativelink-util", "parking_lot", "pretty_assertions", - "prometheus-client", "prost", "prost-types", "rand", @@ -2015,7 +2070,6 @@ dependencies = [ name = "nativelink-store" version = "0.4.0" dependencies = [ - "arc-cell", "async-lock", "async-trait", "aws-config", @@ -2039,6 +2093,7 @@ dependencies = [ "nativelink-config", "nativelink-error", "nativelink-macro", + "nativelink-metric", "nativelink-proto", "nativelink-util", "once_cell", @@ -2080,12 +2135,12 @@ dependencies = [ "nativelink-config", "nativelink-error", "nativelink-macro", + "nativelink-metric", "nativelink-proto", "parking_lot", "pin-project", "pin-project-lite", "pretty_assertions", - "prometheus-client", "prost", "prost-types", "rand", @@ -2114,6 +2169,7 @@ dependencies = [ 
"nativelink-config", "nativelink-error", "nativelink-macro", + "nativelink-metric", "nativelink-proto", "nativelink-scheduler", "nativelink-store", @@ -2211,12 +2267,68 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "opentelemetry" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b69a91d4893e713e06f724597ad630f1fa76057a5e1026c0ca67054a9032a76" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + +[[package]] +name = "opentelemetry-prometheus" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e1a24eafe47b693cb938f8505f240dc26c71db60df9aca376b4f857e9653ec7" +dependencies = [ + "once_cell", + "opentelemetry", + "opentelemetry_sdk", + "prometheus", + "protobuf", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae312d58eaa90a82d2e627fd86e075cf5230b3f11794e2ed74199ebbe572d4fd" +dependencies = [ + "async-trait", + "futures-channel", + "futures-executor", + "futures-util", + "glob", + "lazy_static", + "once_cell", + "opentelemetry", + "ordered-float", + "percent-encoding", + "rand", + "thiserror", +] + [[package]] name = "option-ext" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "ordered-float" +version = "4.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ff2cf528c6c03d9ed653d6c4ce1dc0582dc4af309790ad92f07c1cd551b0be" +dependencies = [ + "num-traits", +] + [[package]] name = "outref" version = "0.5.1" @@ -2447,26 +2559,18 @@ dependencies = [ ] [[package]] -name = "prometheus-client" -version = "0.21.2" 
+name = "prometheus" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c99afa9a01501019ac3a14d71d9f94050346f55ca471ce90c799a15c58f61e2" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" dependencies = [ - "dtoa", - "itoa", + "cfg-if", + "fnv", + "lazy_static", + "memchr", "parking_lot", - "prometheus-client-derive-encode", -] - -[[package]] -name = "prometheus-client-derive-encode" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "440f724eba9f6996b75d63681b0a92b06947f1457076d503a4d2e2c8f56442b8" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.68", + "protobuf", + "thiserror", ] [[package]] @@ -2522,6 +2626,12 @@ dependencies = [ "prost", ] +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" + [[package]] name = "ptr_meta" version = "0.1.4" @@ -3806,6 +3916,60 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.68", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.68", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" + [[package]] name = "webpki-roots" version = "0.25.4" diff --git a/Cargo.toml b/Cargo.toml index 5d134e711..d03e54497 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,8 @@ nativelink-service = { path = "nativelink-service" } nativelink-store = { path = "nativelink-store" } nativelink-util = { path = "nativelink-util" } nativelink-worker = { path = "nativelink-worker" } +nativelink-metric = { path = "nativelink-metric" } +nativelink-metric-collector = { path = "nativelink-metric-collector" } async-lock = "3.3.0" axum = "0.6.20" @@ -49,7 +51,6 @@ futures = "0.3.30" hyper = { version = "0.14.28" } mimalloc = "0.1.41" parking_lot = "0.12.2" -prometheus-client = "0.21.2" rustls-pemfile = "2.1.2" scopeguard = "1.2.0" serde_json5 = "0.1.0" @@ -58,3 +59,9 @@ tokio-rustls = "0.25.0" tonic = { version = "0.11.0", features = ["gzip", "tls"] } tower = "0.4.13" tracing = "0.1.40" +opentelemetry_sdk = { version = "0.23.0", features = ["metrics"] } +tracing-subscriber = "0.3.18" +opentelemetry = { version = "0.23.0", features = ["metrics"] } +prometheus = "0.13.4" +opentelemetry-prometheus = "0.16.0" +serde_json = "1.0.120" diff --git a/integration_tests/simple_prometheus_test.sh b/integration_tests/simple_prometheus_test.sh index 37028e06a..9a6309bd3 100644 --- a/integration_tests/simple_prometheus_test.sh +++ 
b/integration_tests/simple_prometheus_test.sh @@ -34,21 +34,14 @@ echo "$all_contents" # in the config file of integration tests for the CAS. echo 'Checking: nativelink_stores_AC_MAIN_STORE_evicting_map_max_bytes 500000000' grep -q 'nativelink_stores_AC_MAIN_STORE_evicting_map_max_bytes 500000000' <<< "$all_contents" -echo 'Checking: nativelink_stores_AC_MAIN_STORE_read_buff_size_bytes 32768' -grep -q 'nativelink_stores_AC_MAIN_STORE_read_buff_size_bytes 32768' <<< "$all_contents" -echo 'Checking: nativelink_stores_AC_MAIN_STORE_evicting_map_max_bytes 500000000' -grep -q 'nativelink_stores_AC_MAIN_STORE_evicting_map_max_bytes 500000000' <<< "$all_contents" +echo 'Checking: nativelink_stores_AC_MAIN_STORE_read_buffer_size 32768' +grep -q 'nativelink_stores_AC_MAIN_STORE_read_buffer_size 32768' <<< "$all_contents" +echo 'Checking: nativelink_stores_CAS_MAIN_STORE_inner_store_evicting_map_max_bytes 10000000000' +grep -q 'nativelink_stores_CAS_MAIN_STORE_inner_store_evicting_map_max_bytes 10000000000' <<< "$all_contents" # Ensure our store metrics are only published once. -count=$(grep 'nativelink_stores_AC_MAIN_STORE_evicting_map_max_bytes 500000000' <<< "$all_contents" | wc -l) +count=$(grep 'nativelink_stores_CAS_MAIN_STORE_inner_store_evicting_map_max_bytes 10000000000' <<< "$all_contents" | wc -l) if [[ $count -ne 1 ]]; then echo "Expected to find 1 instance of CAS_MAIN_STORE, but found $count" exit 1 fi - -# Check dynamic metrics in some of the stores. -# These are the most stable settings to test that are dymaic. 
-echo 'Checking: nativelink_stores_AC_MAIN_STORE_evicting_map_item_size_bytes{quantile="0.99"}' -grep -q 'nativelink_stores_AC_MAIN_STORE_evicting_map_item_size_bytes{quantile="0.99"}' <<< "$all_contents" -echo 'Checking: nativelink_stores_AC_MAIN_STORE_evicting_map_items_in_store_total 3' -grep -q 'nativelink_stores_AC_MAIN_STORE_evicting_map_items_in_store_total 3' <<< "$all_contents" diff --git a/nativelink-error/BUILD.bazel b/nativelink-error/BUILD.bazel index 8754501ea..841be517c 100644 --- a/nativelink-error/BUILD.bazel +++ b/nativelink-error/BUILD.bazel @@ -12,6 +12,7 @@ rust_library( ], visibility = ["//visibility:public"], deps = [ + "//nativelink-metric", "//nativelink-proto", "@crates//:hex", "@crates//:prost", diff --git a/nativelink-error/Cargo.toml b/nativelink-error/Cargo.toml index 7d2b96883..d687953f9 100644 --- a/nativelink-error/Cargo.toml +++ b/nativelink-error/Cargo.toml @@ -9,6 +9,7 @@ autobenches = false [dependencies] nativelink-proto = { path = "../nativelink-proto" } +nativelink-metric = { path = "../nativelink-metric" } hex = "0.4.3" prost = "0.12.4" diff --git a/nativelink-error/src/lib.rs b/nativelink-error/src/lib.rs index 0c4f69d2d..44e449dc3 100644 --- a/nativelink-error/src/lib.rs +++ b/nativelink-error/src/lib.rs @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use nativelink_metric::{ + MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, +}; use prost_types::TimestampError; use serde::{Deserialize, Serialize}; @@ -47,6 +50,16 @@ pub struct Error { pub messages: Vec, } +impl MetricsComponent for Error { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + self.to_string().publish(kind, field_metadata) + } +} + impl Error { pub fn new(code: Code, msg: String) -> Self { let mut msgs = Vec::with_capacity(1); diff --git a/nativelink-metric-collector/BUILD.bazel b/nativelink-metric-collector/BUILD.bazel new file mode 100644 index 000000000..aac4ac551 --- /dev/null +++ b/nativelink-metric-collector/BUILD.bazel @@ -0,0 +1,63 @@ +load( + "@rules_rust//rust:defs.bzl", + "rust_doc", + "rust_doc_test", + "rust_library", + "rust_test_suite", +) + +rust_library( + name = "nativelink-metric-collector", + srcs = [ + "src/lib.rs", + "src/metrics_collection.rs", + "src/metrics_visitors.rs", + "src/otel_exporter.rs", + "src/tracing_layers.rs", + ], + visibility = ["//visibility:public"], + deps = [ + "//nativelink-metric", + "@crates//:opentelemetry", + "@crates//:parking_lot", + "@crates//:serde", + "@crates//:tracing", + "@crates//:tracing-subscriber", + ], +) + +rust_test_suite( + name = "integration", + timeout = "short", + srcs = [ + "tests/metric_collector_test.rs", + ], + proc_macro_deps = [ + "//nativelink-macro", + ], + deps = [ + ":nativelink-metric-collector", + "//nativelink-error", + "//nativelink-metric", + "@crates//:opentelemetry", + "@crates//:opentelemetry-prometheus", + "@crates//:opentelemetry_sdk", + "@crates//:prometheus", + "@crates//:serde_json", + "@crates//:tokio", + "@crates//:tracing", + "@crates//:tracing-subscriber", + ], +) + +rust_doc( + name = "docs", + crate = ":nativelink-metric-collector", + visibility = ["//visibility:public"], +) + +rust_doc_test( + name = "doc_test", + timeout = "short", + crate = ":nativelink-metric-collector", +) 
diff --git a/nativelink-metric-collector/Cargo.toml b/nativelink-metric-collector/Cargo.toml new file mode 100644 index 000000000..ec739f7eb --- /dev/null +++ b/nativelink-metric-collector/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "nativelink-metric-collector" +version = "0.4.0" +edition = "2021" +rust-version = "1.79.0" + +[dependencies] +nativelink-metric = { path = "../nativelink-metric" } + +opentelemetry = { version = "0.23.0", features = ["metrics"] } +parking_lot = "0.12.2" +serde = "1.0.204" +tracing = "0.1.40" +tracing-subscriber = "0.3.18" + +[dev-dependencies] +nativelink-error = { path = "../nativelink-error" } + +opentelemetry_sdk = { version = "0.23.0", features = ["metrics"] } +opentelemetry-prometheus = "0.16.0" +prometheus = "0.13.4" +serde_json = "1.0.120" diff --git a/nativelink-metric-collector/src/lib.rs b/nativelink-metric-collector/src/lib.rs new file mode 100644 index 000000000..9316abdc6 --- /dev/null +++ b/nativelink-metric-collector/src/lib.rs @@ -0,0 +1,21 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub use otel_exporter::otel_export; +pub use tracing_layers::MetricsCollectorLayer; + +mod metrics_collection; +mod metrics_visitors; +mod otel_exporter; +mod tracing_layers; diff --git a/nativelink-metric-collector/src/metrics_collection.rs b/nativelink-metric-collector/src/metrics_collection.rs new file mode 100644 index 000000000..592f522d5 --- /dev/null +++ b/nativelink-metric-collector/src/metrics_collection.rs @@ -0,0 +1,88 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::borrow::Cow; +use std::collections::HashMap; +use std::ops::{Deref, DerefMut}; + +use serde::Serialize; + +use crate::metrics_visitors::CollectionKind; + +/// The final-metric primitive value that was collected with type. +#[derive(Debug, Serialize)] +#[serde(untagged)] +pub enum CollectedMetricPrimitiveValue { + Counter(u64), + String(Cow<'static, str>), +} + +/// The final-metric primitive field that was collected. 
+#[derive(Default, Debug)] +pub struct CollectedMetricPrimitive { + pub value: Option, + pub help: String, + pub value_type: CollectionKind, +} + +impl Serialize for CollectedMetricPrimitive { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match &self.value { + Some(CollectedMetricPrimitiveValue::Counter(value)) => serializer.serialize_u64(*value), + Some(CollectedMetricPrimitiveValue::String(value)) => serializer.serialize_str(value), + None => serializer.serialize_none(), + } + } +} + +/// Key-value represented output. +pub type CollectedMetricChildren = HashMap; + +/// The type of the collected metric (eg: nested vs primitive). +#[derive(Debug, Serialize)] +#[serde(untagged)] +pub enum CollectedMetrics { + Primitive(CollectedMetricPrimitive), + Component(Box), +} + +impl CollectedMetrics { + pub fn new_component() -> Self { + Self::Component(Box::default()) + } +} + +/// The root metric component that was collected. +#[derive(Default, Debug, Serialize)] +pub struct RootMetricCollectedMetrics { + #[serde(flatten)] + inner: CollectedMetricChildren, +} + +impl Deref for RootMetricCollectedMetrics { + type Target = CollectedMetricChildren; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl DerefMut for RootMetricCollectedMetrics { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} diff --git a/nativelink-metric-collector/src/metrics_visitors.rs b/nativelink-metric-collector/src/metrics_visitors.rs new file mode 100644 index 000000000..dc3525c54 --- /dev/null +++ b/nativelink-metric-collector/src/metrics_visitors.rs @@ -0,0 +1,161 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::borrow::Cow; +use std::fmt::Debug; + +use nativelink_metric::MetricKind; +use serde::Serialize; +use tracing::field::{Field, Visit}; + +use crate::metrics_collection::{CollectedMetricPrimitive, CollectedMetricPrimitiveValue}; + +/// The type of the collected primitive metric. +#[derive(Default, Debug, Serialize)] +pub enum CollectionKind { + #[default] + Counter = 0, + String = 1, +} + +impl From for CollectionKind { + fn from(kind: MetricKind) -> Self { + match kind { + MetricKind::Counter => CollectionKind::Counter, + MetricKind::String => CollectionKind::String, + _ => CollectionKind::String, + } + } +} + +/// The final-metric primitive value and type that was collected. +#[derive(Debug)] +enum ValueWithPrimitiveType { + String(String), + U64(u64), +} + +impl Default for ValueWithPrimitiveType { + fn default() -> Self { + ValueWithPrimitiveType::U64(0) + } +} + +/// An intermediate struct that will have its contents populated +/// by the `tracing` layer for a given field. +/// This is done by implementing the `Visit` trait and asking the +/// `tracing` library to visit the fields of the captured event +/// and populate this struct. 
+#[derive(Default, Debug)] +pub struct MetricDataVisitor { + pub name: String, + value: ValueWithPrimitiveType, + help: String, + value_type: Option, +} + +impl From for CollectedMetricPrimitive { + fn from(visitor: MetricDataVisitor) -> Self { + let (value, derived_type) = match visitor.value { + ValueWithPrimitiveType::String(s) => ( + CollectedMetricPrimitiveValue::String(Cow::Owned(s)), + CollectionKind::String, + ), + ValueWithPrimitiveType::U64(u) => ( + CollectedMetricPrimitiveValue::Counter(u), + CollectionKind::Counter, + ), + }; + CollectedMetricPrimitive { + value: Some(value), + help: visitor.help, + value_type: visitor.value_type.unwrap_or(derived_type), + } + } +} + +impl Visit for MetricDataVisitor { + fn record_debug(&mut self, _field: &Field, _value: &dyn Debug) {} + + fn record_f64(&mut self, field: &Field, value: f64) { + if field.name() == "__value" { + self.value = ValueWithPrimitiveType::String(value.to_string()) + } + } + fn record_i64(&mut self, field: &Field, value: i64) { + if field.name() == "__value" { + match u64::try_from(value) { + Ok(v) => self.value = ValueWithPrimitiveType::U64(v), + Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()), + } + } + } + fn record_u64(&mut self, field: &Field, value: u64) { + match field.name() { + "__value" => self.value = ValueWithPrimitiveType::U64(value), + "__type" => self.value_type = Some(MetricKind::from(value).into()), + "__help" => self.help = value.to_string(), + "__name" => self.name = value.to_string(), + field => panic!("UNKNOWN FIELD {field}"), + } + } + fn record_i128(&mut self, field: &Field, value: i128) { + if field.name() == "__value" { + match u64::try_from(value) { + Ok(v) => self.value = ValueWithPrimitiveType::U64(v), + Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()), + } + } + } + fn record_u128(&mut self, field: &Field, value: u128) { + if field.name() == "__value" { + match u64::try_from(value) { + Ok(v) => self.value = 
ValueWithPrimitiveType::U64(v), + Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()), + } + } + } + fn record_bool(&mut self, field: &Field, value: bool) { + if field.name() == "__value" { + self.value = ValueWithPrimitiveType::U64(u64::from(value)); + } + } + fn record_str(&mut self, field: &Field, value: &str) { + match field.name() { + "__value" => self.value = ValueWithPrimitiveType::String(value.to_string()), + "__help" => self.help = value.to_string(), + "__name" => self.name = value.to_string(), + field => panic!("UNKNOWN FIELD {field}"), + } + } + fn record_error(&mut self, _field: &Field, _value: &(dyn std::error::Error + 'static)) {} +} + +/// An intermediate struct that will have its contents populated +/// by the `tracing` layer for a given field. +/// This is the same as `MetricDataVisitor` but only captures info +/// about a given span on span creation. +pub struct SpanFields { + pub name: Cow<'static, str>, +} + +impl Visit for SpanFields { + fn record_debug(&mut self, _field: &Field, _value: &dyn Debug) {} + + fn record_str(&mut self, field: &Field, value: &str) { + if field.name() == "__name" { + self.name = Cow::Owned(value.to_string()); + } + } +} diff --git a/nativelink-metric-collector/src/otel_exporter.rs b/nativelink-metric-collector/src/otel_exporter.rs new file mode 100644 index 000000000..397aed358 --- /dev/null +++ b/nativelink-metric-collector/src/otel_exporter.rs @@ -0,0 +1,75 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use opentelemetry::metrics::Meter; +use tracing::info; + +use crate::metrics_collection::{ + CollectedMetricChildren, CollectedMetricPrimitive, CollectedMetricPrimitiveValue, + CollectedMetrics, RootMetricCollectedMetrics, +}; + +/// The maximum length of a metric name that otel supports. +/// Going beyond this limit causes otel to complain. +const MAX_METRIC_NAME_LENGTH: usize = 256; + +/// Export the collected metrics to the OpenTelemetry meter. +pub fn otel_export( + mut root_prefix: String, + meter: &Meter, + root_collected_metrics: &RootMetricCollectedMetrics, +) { + if !root_prefix.is_empty() { + root_prefix.push('_'); + } + process_children(&mut root_prefix, meter, root_collected_metrics); +} + +fn process_children(prefix: &mut String, meter: &Meter, children: &CollectedMetricChildren) { + for (name, child) in children { + prefix.push_str(name); + let mut added_prefix_len = name.len(); + match child { + CollectedMetrics::Primitive(primitive) => { + process_primitive(prefix, meter, primitive); + } + CollectedMetrics::Component(component) => { + prefix.push('_'); + added_prefix_len += 1; + process_children(prefix, meter, component); + } + } + prefix.truncate(prefix.len() - added_prefix_len); + } +} + +fn process_primitive(prefix: &mut String, meter: &Meter, primitive: &CollectedMetricPrimitive) { + match &primitive.value { + Some(CollectedMetricPrimitiveValue::Counter(value)) => { + if prefix.len() > MAX_METRIC_NAME_LENGTH { + info!("Metric name longer than 256 characters: {}", prefix); + return; + } + let counter = meter + .u64_counter(prefix.clone()) + .with_description(primitive.help.clone()) + .init(); + counter.add(*value, &[]); + } + Some(CollectedMetricPrimitiveValue::String(_value)) => { + // We don't publish strings in metrics. 
+ } + None => {} + } +} diff --git a/nativelink-metric-collector/src/tracing_layers.rs b/nativelink-metric-collector/src/tracing_layers.rs new file mode 100644 index 000000000..21d25c2f8 --- /dev/null +++ b/nativelink-metric-collector/src/tracing_layers.rs @@ -0,0 +1,148 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt::Debug; +use std::marker::PhantomData; +use std::ops::DerefMut; +use std::sync::Arc; + +use parking_lot::Mutex; +use tracing::span::Attributes; +use tracing::subscriber::Interest; +use tracing::{Event, Id, Metadata, Subscriber}; +use tracing_subscriber::layer::Context; +use tracing_subscriber::registry::SpanRef; +use tracing_subscriber::Layer; + +use crate::metrics_collection::{ + CollectedMetricChildren, CollectedMetricPrimitive, CollectedMetrics, RootMetricCollectedMetrics, +}; +use crate::metrics_visitors::{MetricDataVisitor, SpanFields}; + +/// The layer that is given to `tracing` to collect metrics. +/// The output of the metrics will be populated in the `root_collected_metrics` +/// field. +pub struct MetricsCollectorLayer { + spans: Mutex>, + root_collected_metrics: Arc>, + _subscriber: PhantomData, +} + +impl MetricsCollectorLayer { + /// Creates a new `MetricsCollectorLayer` and returns it along with the + /// `root_collected_metrics` that will be populated with the collected metrics. 
+ pub fn new() -> (Self, Arc>) { + let root_collected_metrics = Arc::new(Mutex::new(RootMetricCollectedMetrics::default())); + ( + MetricsCollectorLayer { + spans: Mutex::new(HashMap::new()), + root_collected_metrics: root_collected_metrics.clone(), + _subscriber: PhantomData, + }, + root_collected_metrics, + ) + } +} + +impl Layer for MetricsCollectorLayer +where + S: Subscriber + for<'a> tracing_subscriber::registry::LookupSpan<'a> + Debug, +{ + fn enabled(&self, metadata: &Metadata<'_>, _ctx: Context<'_, S>) -> bool { + metadata.target() == "nativelink_metric" + } + + fn on_new_span(&self, attrs: &Attributes<'_>, id: &Id, _ctx: Context<'_, S>) { + let mut span_fields = SpanFields { + name: Cow::Borrowed(attrs.metadata().name()), + }; + // Store the current metadata values map representing the current span. + // We need to 'snapshot' the current span, because when a more recent + // span (such as the one being initialized) updates, these values will + // be overwritten. + attrs.values().record(&mut span_fields); + + self.spans.lock().insert(id.clone(), span_fields); + } + + fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) { + let mut event_visitor = MetricDataVisitor::default(); + // First, we populate the MetricDataVisitor we are interested + // in from the event. + event.record(&mut event_visitor); + // This represents the field we are concerned with updating or + // initializing. + let name = event_visitor.name.clone(); + + let mut root_collected_metrics = self.root_collected_metrics.lock(); + let collected_component = root_collected_metrics.deref_mut().deref_mut(); + + // Find out which span we are currently in and retrieve its metadata. + // It is possible to not be in a span in the tracing library. + // If we are not in a span, we assume you want your metrics published + // in the root of the collected metrics. 
+ if let Some(current_span) = ctx.lookup_current() { + let mut known_spans = self.spans.lock(); + // By default tracing starts you at the bottom of the span tree, + // but we want to start at the root of the tree and walk down, + // so invert it. + let span_iter = current_span.scope().from_root(); + // Find the layer in our output struct we are going to populate + // the data into. + let collected_component = + find_component(span_iter, known_spans.deref_mut(), collected_component); + + // Get the new value from the event and update it in the component. + let primitive = CollectedMetricPrimitive::from(event_visitor); + collected_component.insert(name, CollectedMetrics::Primitive(primitive)); + } else { + let primitive = CollectedMetricPrimitive::from(event_visitor); + collected_component.insert(name, CollectedMetrics::Primitive(primitive)); + } + } + + fn register_callsite(&self, _metadata: &'static Metadata<'static>) -> Interest { + Interest::always() + } +} + +fn find_component<'a, 'b, S, I>( + mut iter: I, + known_spans: &'a mut HashMap, + mut collected_component: &'a mut CollectedMetricChildren, +) -> &'a mut CollectedMetricChildren +where + S: Subscriber + for<'c> tracing_subscriber::registry::LookupSpan<'c> + Debug, + I: Iterator>, +{ + let Some(span) = iter.next() else { + // Once there are no more nested spans, we have reached a leaf field. + return collected_component; + }; + let span_fields = known_spans.get(&span.id()).expect("Span not found"); + // LayerMap> + // This is a hashmap of the existing data for the layer + let collected_metric = collected_component + .entry(span_fields.name.to_string()) + .or_insert_with(CollectedMetrics::new_component); + + collected_component = match collected_metric { + CollectedMetrics::Component(component) => component.deref_mut(), + _ => panic!("Expected to be component"), + }; + // DFS the iterator of keys and return the first leaf found matching the name query. 
+ find_component(iter, known_spans, collected_component) +} diff --git a/nativelink-metric-collector/tests/metric_collector_test.rs b/nativelink-metric-collector/tests/metric_collector_test.rs new file mode 100644 index 000000000..c45263a57 --- /dev/null +++ b/nativelink-metric-collector/tests/metric_collector_test.rs @@ -0,0 +1,194 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt::Debug; +use std::io::{BufRead, Cursor}; +use std::marker::PhantomData; +use std::str::from_utf8; + +use nativelink_error::Error; +use nativelink_metric::{MetricFieldData, MetricKind, MetricsComponent}; +use nativelink_metric_collector::{otel_export, MetricsCollectorLayer}; +use opentelemetry::metrics::MeterProvider; +use opentelemetry_sdk::metrics::SdkMeterProvider; +use prometheus::{Encoder, TextEncoder}; +use serde_json::Value; +use tracing_subscriber::layer::SubscriberExt; + +#[derive(MetricsComponent)] +pub struct MultiStruct { + #[metric(help = "dummy help pub_u64")] + pub pub_u64: u64, + + #[metric(help = "Dummy help str")] + str: String, + + _no_metric_str: String, + _no_metric_u64: u64, + + #[metric(group = "foo")] + sub_struct_group: Foo<'static, String>, + + #[metric] + sub_struct: Foo<'static, String>, +} + +#[derive(MetricsComponent)] +struct Foo<'a, T: Debug + Send + Sync> { + #[metric(help = "help str1", handler = ToString::to_string)] + 
custom_handler_num_str: u64, + + #[metric(help = "help str2", handler = ToString::to_string, kind = "counter")] + custom_handler_num_counter: u64, + + _bar: &'a PhantomData, +} + +// Note: Special case to not use nativelink-test macro. We want this test +// to be very lightweight and not depend on other crates. +#[test] +fn test_metric_collector() -> Result<(), Error> { + let multi_struct = MultiStruct { + pub_u64: 1, + str: "str_data".to_string(), + _no_metric_str: "no_metric_str".to_string(), + _no_metric_u64: 2, + sub_struct_group: Foo { + custom_handler_num_str: 3, + custom_handler_num_counter: 4, + _bar: &PhantomData, + }, + sub_struct: Foo { + custom_handler_num_str: 5, + custom_handler_num_counter: 6, + _bar: &PhantomData, + }, + }; + let (layer, output_metrics) = MetricsCollectorLayer::new(); + let subscriber = tracing_subscriber::registry().with(layer); + + tracing::subscriber::with_default(subscriber, || { + MetricsComponent::publish( + &multi_struct, + MetricKind::Component, + MetricFieldData::default(), + ) + .unwrap(); + }); + + let output_json_data = serde_json::to_string(&*output_metrics.lock()).unwrap(); + let final_output_metrics: HashMap = + serde_json::from_str(&output_json_data).unwrap(); + let expected_json_data = r#"{"custom_handler_num_str":"5","str":"str_data","foo":{"custom_handler_num_counter":4,"custom_handler_num_str":"3"},"pub_u64":1,"custom_handler_num_counter":6}"#; + let expected_value: HashMap = serde_json::from_str(expected_json_data).unwrap(); + + // We cannot compare the strings directly as the order + // of the keys in the JSON string can be different. + // instead we go to string then back to anonymous hashmaps + // then validate the values. + assert_eq!(final_output_metrics, expected_value); + // To ensure the round trip is correct, we compare the length of the + // output JSON string and the expected JSON string. 
+ assert_eq!(output_json_data.len(), expected_json_data.len()); + // Ensure the double round trip is also correct and not an + // encoding issue. + assert_eq!( + serde_json::to_string(&final_output_metrics).unwrap().len(), + expected_json_data.len() + ); + + Ok(()) +} + +// Note: Special case to not use nativelink-test macro. We want this test +// to be very lightweight and not depend on other crates. +#[test] +fn test_prometheus_exporter() -> Result<(), Error> { + let multi_struct = MultiStruct { + pub_u64: 1, + str: "str_data".to_string(), + _no_metric_str: "no_metric_str".to_string(), + _no_metric_u64: 2, + sub_struct_group: Foo { + custom_handler_num_str: 3, + custom_handler_num_counter: 4, + _bar: &PhantomData, + }, + sub_struct: Foo { + custom_handler_num_str: 5, + custom_handler_num_counter: 6, + _bar: &PhantomData, + }, + }; + let (layer, output_metrics) = MetricsCollectorLayer::new(); + let subscriber = tracing_subscriber::registry().with(layer); + + tracing::subscriber::with_default(subscriber, || { + MetricsComponent::publish( + &multi_struct, + MetricKind::Component, + MetricFieldData::default(), + ) + .unwrap(); + }); + + let registry = prometheus::Registry::new(); + let exporter = opentelemetry_prometheus::exporter() + .with_registry(registry.clone()) + .without_counter_suffixes() + .without_scope_info() + .build() + .unwrap(); + + // Prepare our OpenTelemetry collector/exporter. + let provider = SdkMeterProvider::builder().with_reader(exporter).build(); + let meter = provider.meter("nativelink"); + + // Export the metrics to OpenTelemetry. + otel_export("nativelink".to_string(), &meter, &output_metrics.lock()); + + // Translate the OpenTelemetry metrics to Prometheus format and encode + // them into a hyper::Response. 
+ let mut result = vec![]; + TextEncoder::new() + .encode(®istry.gather(), &mut result) + .unwrap(); + + let mut output: Vec = Cursor::new(from_utf8(&result).unwrap()) + .lines() + .map(|v| v.unwrap()) + .collect(); + let mut expected_output: Vec = Cursor::new(r#" +# HELP nativelink_custom_handler_num_counter help str2 +# HELP nativelink_foo_custom_handler_num_counter help str2 +# HELP nativelink_pub_u64 dummy help pub_u64 +# HELP target_info Target metadata +# TYPE nativelink_custom_handler_num_counter counter +# TYPE nativelink_foo_custom_handler_num_counter counter +# TYPE nativelink_pub_u64 counter +# TYPE target_info gauge +nativelink_custom_handler_num_counter 6 +nativelink_foo_custom_handler_num_counter 4 +nativelink_pub_u64 1 +target_info{service_name="unknown_service",telemetry_sdk_language="rust",telemetry_sdk_name="opentelemetry",telemetry_sdk_version="0.23.0"} 1 +"#.trim()).lines().map(|v| v.unwrap()).collect(); + + // We need to sort because the output order is non-deterministic. 
+ output.sort(); + expected_output.sort(); + + assert_eq!(output, expected_output); + Ok(()) +} diff --git a/nativelink-metric/BUILD.bazel b/nativelink-metric/BUILD.bazel new file mode 100644 index 000000000..32e7d6a91 --- /dev/null +++ b/nativelink-metric/BUILD.bazel @@ -0,0 +1,35 @@ +load( + "@rules_rust//rust:defs.bzl", + "rust_doc", + "rust_doc_test", + "rust_library", +) + +rust_library( + name = "nativelink-metric", + srcs = [ + "src/lib.rs", + ], + proc_macro_deps = [ + "//nativelink-metric/nativelink-metric-macro-derive", + ], + visibility = ["//visibility:public"], + deps = [ + "@crates//:async-lock", + "@crates//:parking_lot", + "@crates//:tokio", + "@crates//:tracing", + ], +) + +rust_doc( + name = "docs", + crate = ":nativelink-metric", + visibility = ["//visibility:public"], +) + +rust_doc_test( + name = "doc_test", + timeout = "short", + crate = ":nativelink-metric", +) diff --git a/nativelink-metric/Cargo.toml b/nativelink-metric/Cargo.toml new file mode 100644 index 000000000..9a6a2d9d4 --- /dev/null +++ b/nativelink-metric/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "nativelink-metric" +version = "0.4.0" +edition = "2021" + +[dependencies] +nativelink-metric-macro-derive = { path = "nativelink-metric-macro-derive" } + +async-lock = "3.3.0" +parking_lot = "0.12.2" +tracing = "0.1.40" +tokio = { version = "1.37.0", features = ["sync"] } diff --git a/nativelink-metric/nativelink-metric-macro-derive/BUILD.bazel b/nativelink-metric/nativelink-metric-macro-derive/BUILD.bazel new file mode 100644 index 000000000..c1a24a35a --- /dev/null +++ b/nativelink-metric/nativelink-metric-macro-derive/BUILD.bazel @@ -0,0 +1,31 @@ +load( + "@rules_rust//rust:defs.bzl", + "rust_doc", + "rust_doc_test", + "rust_proc_macro", +) + +rust_proc_macro( + name = "nativelink-metric-macro-derive", + srcs = [ + "src/lib.rs", + ], + visibility = ["//visibility:public"], + deps = [ + "@crates//:proc-macro2", + "@crates//:quote", + "@crates//:syn", + ], +) + +rust_doc( + name = 
"docs", + crate = ":nativelink-metric-macro-derive", + visibility = ["//visibility:public"], +) + +rust_doc_test( + name = "doc_test", + timeout = "short", + crate = ":nativelink-metric-macro-derive", +) diff --git a/nativelink-metric/nativelink-metric-macro-derive/Cargo.toml b/nativelink-metric/nativelink-metric-macro-derive/Cargo.toml new file mode 100644 index 000000000..07f79baed --- /dev/null +++ b/nativelink-metric/nativelink-metric-macro-derive/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "nativelink-metric-macro-derive" +version = "0.4.0" +edition = "2021" + +[lib] +proc-macro = true + +[dependencies] +# TODO(allada) We currently need to pin these to specific version. +# Some down-stream can't be upgraded just yet. +proc-macro2 = { version = "=1.0.86", features = ["proc-macro", "span-locations"] } +quote = "=1.0.36" +syn = { version = "=2.0.68", features = ["extra-traits", "full", "fold"] } diff --git a/nativelink-metric/nativelink-metric-macro-derive/src/lib.rs b/nativelink-metric/nativelink-metric-macro-derive/src/lib.rs new file mode 100644 index 000000000..8b8d9c51f --- /dev/null +++ b/nativelink-metric/nativelink-metric-macro-derive/src/lib.rs @@ -0,0 +1,238 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use core::panic; + +use proc_macro::TokenStream; +use quote::{format_ident, quote, ToTokens}; +use syn::parse::{Parse, ParseStream}; +use syn::{ + parse_macro_input, Attribute, DeriveInput, Ident, ImplGenerics, LitStr, TypeGenerics, + WhereClause, +}; + +/// Holds the type of group for the metric. For example, if a metric +/// has no group it'll be `None`, if it has a static group name it'll +/// be `StaticGroupName(name_of_metric)`. +#[derive(Default, Debug)] +enum GroupType { + #[default] + None, + StaticGroupName(Ident), +} + +impl Parse for GroupType { + fn parse(input: ParseStream) -> syn::Result { + if input.is_empty() { + return Ok(GroupType::None); + } + let group_str: LitStr = input.parse()?; + let group = format_ident!("{}", group_str.value()); + Ok(GroupType::StaticGroupName(group)) + } +} + +impl ToTokens for GroupType { + fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) { + match self { + GroupType::None => { + quote! { "" } + } + GroupType::StaticGroupName(group) => quote! { stringify!(#group) }, + } + .to_tokens(tokens); + } +} + +/// Holds the type of the metric. If the metric was not specified +/// it'll be `Default`, which will try to resolve the type from the +/// [`MetricsComponent::publish()`] method that got executed based on +/// the type of the field. +#[derive(Debug)] +enum MetricKind { + Default, + Counter, + String, + Component, +} + +impl Parse for MetricKind { + fn parse(input: ParseStream) -> syn::Result { + let kind_str: LitStr = input.parse()?; + match kind_str.value().as_str() { + "counter" => Ok(MetricKind::Counter), + "string" => Ok(MetricKind::String), + "component" => Ok(MetricKind::Component), + "default" => Ok(MetricKind::Default), + _ => Err(syn::Error::new(kind_str.span(), "Invalid metric type")), + } + } +} + +impl ToTokens for MetricKind { + fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) { + match self { + MetricKind::Counter => quote! 
{ ::nativelink_metric::MetricKind::Counter }, + MetricKind::String => quote! { ::nativelink_metric::MetricKind::String }, + MetricKind::Component => quote! { ::nativelink_metric::MetricKind::Component }, + MetricKind::Default => quote! { ::nativelink_metric::MetricKind::Default }, + } + .to_tokens(tokens); + } +} + +/// Holds general information about a specific field that is to be published. +#[derive(Debug)] +struct MetricFieldMetaData<'a> { + field_name: &'a Ident, + metric_kind: MetricKind, + help: Option, + group: GroupType, + handler: Option, +} + +impl<'a> MetricFieldMetaData<'a> { + fn try_from(field_name: &'a Ident, attr: &Attribute) -> syn::Result { + let mut result = MetricFieldMetaData { + field_name, + metric_kind: MetricKind::Default, + help: None, + group: GroupType::None, + handler: None, + }; + // If the attribute is just a path, it has no args, so use defaults. + if let syn::Meta::Path(_) = attr.meta { + return Ok(result); + } + attr.parse_args_with(syn::meta::parser(|meta| { + if meta.path.is_ident("help") { + result.help = meta.value()?.parse()?; + } else if meta.path.is_ident("kind") { + result.metric_kind = meta.value()?.parse()?; + } else if meta.path.is_ident("group") { + result.group = meta.value()?.parse()?; + } else if meta.path.is_ident("handler") { + result.handler = Some(meta.value()?.parse()?); + } + Ok(()) + }))?; + Ok(result) + } +} + +/// Holds the template information about the struct. This is needed +/// to create the `MetricsComponent` impl. +#[derive(Debug)] +struct Generics<'a> { + impl_generics: ImplGenerics<'a>, + ty_generics: TypeGenerics<'a>, + where_clause: Option<&'a WhereClause>, +} + +/// Holds metadata about the struct that is having MetricsComponent +/// implemented. 
+#[derive(Debug)] +struct MetricStruct<'a> { + name: &'a Ident, + metric_fields: Vec>, + generics: Generics<'a>, +} + +impl<'a> ToTokens for MetricStruct<'a> { + fn to_tokens(&self, tokens: &mut proc_macro2::TokenStream) { + let name = &self.name; + let impl_generics = &self.generics.impl_generics; + let ty_generics = &self.generics.ty_generics; + let where_clause = &self.generics.where_clause; + + let metric_fields = self.metric_fields.iter().map(|field| { + let field_name = &field.field_name; + let group = &field.group; + + let help = match field.help.as_ref() { + Some(help) => quote! { #help }, + None => quote! { "" }, + }; + let value = match &field.handler { + Some(handler) => quote! { &#handler(&self.#field_name) }, + None => quote! { &self.#field_name }, + }; + let metric_kind = &field.metric_kind; + quote! { + ::nativelink_metric::publish!( + stringify!(#field_name), + #value, + #metric_kind, + #help, + #group + ); + } + }); + quote! { + impl #impl_generics ::nativelink_metric::MetricsComponent for #name #ty_generics #where_clause { + fn publish(&self, kind: ::nativelink_metric::MetricKind, field_metadata: ::nativelink_metric::MetricFieldData) -> Result<::nativelink_metric::MetricPublishKnownKindData, ::nativelink_metric::Error> { + #( #metric_fields )* + Ok(::nativelink_metric::MetricPublishKnownKindData::Component) + } + } + }.to_tokens(tokens); + } +} + +#[proc_macro_derive(MetricsComponent, attributes(metric))] +pub fn metrics_component_derive(input: TokenStream) -> TokenStream { + let input = parse_macro_input!(input as DeriveInput); + let data = match &input.data { + syn::Data::Struct(data) => data, + _ => panic!("MetricsComponent can only be derived for structs"), + }; + + let mut metric_fields = vec![]; + match &data.fields { + syn::Fields::Named(fields) => { + fields.named.iter().for_each(|field| { + field.attrs.iter().for_each(|attr| { + if attr.path().is_ident("metric") { + metric_fields.push( + 
MetricFieldMetaData::try_from(field.ident.as_ref().unwrap(), attr) + .unwrap(), + ); + } + }); + }); + } + syn::Fields::Unnamed(_) => { + panic!("Unnamed fields are not supported"); + } + syn::Fields::Unit => { + panic!("Unit structs are not supported"); + } + } + + let (impl_generics, ty_generics, where_clause) = input.generics.split_for_impl(); + let metrics_struct = MetricStruct { + name: &input.ident, + metric_fields, + generics: Generics { + impl_generics, + ty_generics, + where_clause, + }, + }; + // This line is intentionally left here to make debugging + // easier. If you want to see the output of the macro, just + // uncomment this line and run the tests. + // panic!("{}", quote! { #metrics_struct }); + TokenStream::from(quote! { #metrics_struct }) +} diff --git a/nativelink-metric/src/lib.rs b/nativelink-metric/src/lib.rs new file mode 100644 index 000000000..cf0695168 --- /dev/null +++ b/nativelink-metric/src/lib.rs @@ -0,0 +1,524 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::borrow::Cow; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::sync::atomic::{AtomicI64, AtomicU64, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +pub use nativelink_metric_macro_derive::MetricsComponent; +pub use tracing::{error as __metric_error, info as __metric_event, info_span as __metric_span}; + +/// Error type for the metrics library. +// Note: We do not use the nativelink-error struct because +// we'd end up in a circular dependency if we did, because +// nativelink-error uses the metrics library. +#[derive(Debug)] +pub struct Error(String); + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::error::Error for Error {} + +/// Holds metadata about the field that is being published. +#[derive(Default, Clone)] +pub struct MetricFieldData<'a> { + pub name: Cow<'a, str>, + pub help: Cow<'a, str>, + pub group: Cow<'a, str>, +} + +/// The final primtive data that is being published with the kind. +#[derive(Debug)] +pub enum MetricPublishKnownKindData { + Counter(u64), + String(String), + Component, +} + +/// The kind of metric that is being published. +// Note: This enum will be translate in-and-out +// of a u64 when traversing the `tracing::event` +// boundary for efficiency reasons. 
+#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum MetricKind { + Default = 0, + Counter = 1, + String = 2, + Component = 3, +} + +impl From for MetricKind { + fn from(value: u64) -> Self { + match value { + 0 => MetricKind::Default, + 1 => MetricKind::Counter, + 2 => MetricKind::String, + 3 => MetricKind::Component, + _ => MetricKind::Default, + } + } +} + +impl MetricKind { + pub fn into_known_kind(&self, default_kind: MetricKind) -> MetricPublishKnownKindData { + let mut this = *self; + if matches!(self, MetricKind::Default) { + this = default_kind; + } + match this { + MetricKind::Counter => MetricPublishKnownKindData::Counter(0), + MetricKind::String => MetricPublishKnownKindData::String(String::new()), + MetricKind::Component => MetricPublishKnownKindData::Component, + MetricKind::Default => unreachable!("Default should have been handled"), + } + } +} + +/// The trait that all components that can be published must implement. +pub trait MetricsComponent { + fn publish( + &self, + kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result; +} + +pub trait RootMetricsComponent: MetricsComponent + Send + Sync { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + MetricsComponent::publish(self, kind, field_metadata) + } +} + +impl MetricsComponent for Option { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + match self { + Some(value) => value.publish(kind, field_metadata), + None => Ok(MetricPublishKnownKindData::Component), + } + } +} + +impl MetricsComponent for tokio::sync::watch::Sender { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + self.borrow().publish(kind, field_metadata) + } +} + +impl MetricsComponent for Arc { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + self.as_ref().publish(kind, field_metadata) + } +} + +impl MetricsComponent for HashSet { + fn 
publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + for (i, item) in self.iter().enumerate() { + let guard = group!(i).entered(); + let publish_result = item.publish(kind, field_metadata.clone())?; + drop(guard); + match publish_result { + MetricPublishKnownKindData::Counter(value) => { + publish!( + i, + &value, + MetricKind::Counter, + field_metadata.help.to_string() + ); + } + MetricPublishKnownKindData::String(value) => { + publish!( + i, + &value, + MetricKind::String, + field_metadata.help.to_string() + ); + } + MetricPublishKnownKindData::Component => {} + } + } + Ok(MetricPublishKnownKindData::Component) + } +} + +impl MetricsComponent for HashMap { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + for (key, item) in self.iter() { + let guard = group!(key).entered(); + let publish_result = item.publish(kind, field_metadata.clone())?; + drop(guard); + match publish_result { + MetricPublishKnownKindData::Counter(value) => { + publish!( + key, + &value, + MetricKind::Counter, + field_metadata.help.to_string() + ); + } + MetricPublishKnownKindData::String(value) => { + publish!( + key, + &value, + MetricKind::String, + field_metadata.help.to_string() + ); + } + MetricPublishKnownKindData::Component => {} + } + } + Ok(MetricPublishKnownKindData::Component) + } +} + +impl MetricsComponent for BTreeMap { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + for (key, item) in self.iter() { + group!(key).in_scope(|| item.publish(kind, field_metadata.clone()))?; + } + Ok(MetricPublishKnownKindData::Component) + } +} + +impl MetricsComponent for BTreeSet { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + for (i, item) in self.iter().enumerate() { + group!(i).in_scope(|| item.publish(kind, field_metadata.clone()))?; + } + Ok(MetricPublishKnownKindData::Component) + } +} + +impl MetricsComponent 
for Vec { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + for (i, item) in self.iter().enumerate() { + group!(i).in_scope(|| item.publish(kind, field_metadata.clone()))?; + } + Ok(MetricPublishKnownKindData::Component) + } +} + +impl MetricsComponent for Weak { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let Some(this) = self.upgrade() else { + return Ok(MetricPublishKnownKindData::Component); + }; + this.as_ref().publish(kind, field_metadata) + } +} + +impl MetricsComponent for Result +where + T: MetricsComponent, + E: MetricsComponent, +{ + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + match self { + Ok(value) => value.publish(kind, field_metadata), + Err(value) => value.publish(kind, field_metadata), + } + } +} + +impl MetricsComponent for Duration { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + self.as_secs_f64().publish(kind, field_metadata) + } +} + +impl MetricsComponent for SystemTime { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + match SystemTime::now().duration_since(UNIX_EPOCH) { + Ok(n) => n.as_secs().publish(kind, field_metadata), + Err(_) => Err(Error("SystemTime before UNIX EPOCH!".to_string())), + } + } +} + +impl MetricsComponent for f64 { + fn publish( + &self, + _kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result { + Ok(MetricPublishKnownKindData::String(self.to_string())) + } +} + +impl MetricsComponent for bool { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let value = u64::from(*self); + value.publish(kind, field_metadata) + } +} + +impl MetricsComponent for i32 { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let value = u64::try_from(*self) + .map_err(|_| Error(format!("Could not 
convert {self} to u64 in metrics lib")))?; + value.publish(kind, field_metadata) + } +} + +impl MetricsComponent for u64 { + fn publish( + &self, + kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result { + let mut known_kind_data = kind.into_known_kind(MetricKind::Counter); + match &mut known_kind_data { + MetricPublishKnownKindData::Counter(data) => { + *data = *self; + } + MetricPublishKnownKindData::String(data) => { + *data = self.to_string(); + } + MetricPublishKnownKindData::Component => {} + } + Ok(known_kind_data) + } +} + +impl MetricsComponent for i64 { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let value = u64::try_from(*self) + .map_err(|_| Error(format!("Could not convert {self} to u64 in metrics lib")))?; + value.publish(kind, field_metadata) + } +} + +impl MetricsComponent for u32 { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + u64::from(*self).publish(kind, field_metadata) + } +} + +impl MetricsComponent for usize { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let value = u64::try_from(*self) + .map_err(|_| Error(format!("Could not convert {self} to u64 in metrics lib")))?; + value.publish(kind, field_metadata) + } +} + +impl MetricsComponent for AtomicU64 { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + self.load(Ordering::Acquire).publish(kind, field_metadata) + } +} + +impl MetricsComponent for AtomicI64 { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + self.load(Ordering::Acquire).publish(kind, field_metadata) + } +} + +impl MetricsComponent for String { + fn publish( + &self, + kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result { + let mut known_kind_data = kind.into_known_kind(MetricKind::String); + match &mut known_kind_data { + 
MetricPublishKnownKindData::Counter(data) => { + *data = self.parse::().map_err(|_| { + Error(format!( + "Could not convert String '{self}' to u64 in metrics lib" + )) + })?; + } + MetricPublishKnownKindData::String(data) => { + data.clone_from(self); + } + MetricPublishKnownKindData::Component => {} + } + Ok(known_kind_data) + } +} + +impl MetricsComponent for async_lock::Mutex { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + // It is safe to block in the publishing thread. + let lock = self.lock_blocking(); + lock.publish(kind, field_metadata) + } +} + +impl MetricsComponent for parking_lot::Mutex { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + // It is safe to block in the publishing thread. + let lock = self.lock(); + lock.publish(kind, field_metadata) + } +} + +impl MetricsComponent for parking_lot::RwLock { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + // It is safe to block in the publishing thread. + let lock = self.read(); + lock.publish(kind, field_metadata) + } +} + +#[macro_export] +macro_rules! group { + ($name:expr) => { + $crate::__metric_span!(target: "nativelink_metric", "", __name = $name.to_string()) + }; +} + +#[macro_export] +macro_rules! publish { + ($name:expr, $value:expr, $metric_kind:expr, $help:expr) => { + $crate::publish!($name, $value, $metric_kind, $help, "") + }; + ($name:expr, $value:expr, $metric_kind:expr, $help:expr, $group:expr) => { + { + let _maybe_entered = if !$group.is_empty() { + Some($crate::group!($group).entered()) + } else { + None + }; + let name = $name.to_string(); + let field_metadata = $crate::MetricFieldData { + name: ::std::borrow::Cow::Borrowed(&name), + help: $help.into(), + group: $group.into(), + }; + match $crate::MetricsComponent::publish($value, $metric_kind, field_metadata)? 
{ + $crate::MetricPublishKnownKindData::Counter(value) => { + $crate::__metric_event!( + target: "nativelink_metric", + __value = value, + __type = $crate::MetricKind::Counter as u8, + __help = $help.to_string(), + __name = name + ); + } + $crate::MetricPublishKnownKindData::String(value) => { + $crate::__metric_event!( + target: "nativelink_metric", + __value = value, + __type = $crate::MetricKind::String as u8, + __help = $help.to_string(), + __name = name + ); + } + $crate::MetricPublishKnownKindData::Component => { + // Do nothing, data already published. + } + } + } + }; +} diff --git a/nativelink-scheduler/BUILD.bazel b/nativelink-scheduler/BUILD.bazel index 0b1b156e8..787bbaac8 100644 --- a/nativelink-scheduler/BUILD.bazel +++ b/nativelink-scheduler/BUILD.bazel @@ -33,6 +33,7 @@ rust_library( deps = [ "//nativelink-config", "//nativelink-error", + "//nativelink-metric", "//nativelink-proto", "//nativelink-store", "//nativelink-util", @@ -79,6 +80,7 @@ rust_test_suite( ":nativelink-scheduler", "//nativelink-config", "//nativelink-error", + "//nativelink-metric", "//nativelink-proto", "//nativelink-store", "//nativelink-util", diff --git a/nativelink-scheduler/Cargo.toml b/nativelink-scheduler/Cargo.toml index 119f5e4bf..ce5ad5885 100644 --- a/nativelink-scheduler/Cargo.toml +++ b/nativelink-scheduler/Cargo.toml @@ -8,6 +8,7 @@ nativelink-error = { path = "../nativelink-error" } nativelink-config = { path = "../nativelink-config" } nativelink-util = { path = "../nativelink-util" } nativelink-proto = { path = "../nativelink-proto" } +nativelink-metric = { path = "../nativelink-metric" } # TODO(aaronmondal): This should not be a dependency. Move the corresponding # files somewhere else. 
@@ -32,7 +33,7 @@ tracing = "0.1.40" redis = { version = "0.25.2", features = ["aio", "tokio", "json"] } serde = "1.0.203" redis-macros = "0.3.0" -serde_json = "1.0.117" +serde_json = "1.0.120" static_assertions = "1.1.0" [dev-dependencies] diff --git a/nativelink-scheduler/src/action_scheduler.rs b/nativelink-scheduler/src/action_scheduler.rs index 5a2b9be81..c23ed996f 100644 --- a/nativelink-scheduler/src/action_scheduler.rs +++ b/nativelink-scheduler/src/action_scheduler.rs @@ -18,8 +18,8 @@ use std::sync::Arc; use async_trait::async_trait; use futures::Future; use nativelink_error::Error; +use nativelink_metric::RootMetricsComponent; use nativelink_util::action_messages::{ActionInfo, ActionState, ClientOperationId}; -use nativelink_util::metrics_utils::Registry; use crate::platform_property_manager::PlatformPropertyManager; @@ -38,7 +38,7 @@ pub trait ActionListener: Sync + Send + Unpin { /// ActionScheduler interface is responsible for interactions between the scheduler /// and action related operations. #[async_trait] -pub trait ActionScheduler: Sync + Send + Unpin { +pub trait ActionScheduler: Sync + Send + Unpin + RootMetricsComponent + 'static { /// Returns the platform property manager. async fn get_platform_property_manager( &self, @@ -57,7 +57,4 @@ pub trait ActionScheduler: Sync + Send + Unpin { &self, client_operation_id: &ClientOperationId, ) -> Result>>, Error>; - - /// Register the metrics for the action scheduler. - fn register_metrics(self: Arc, _registry: &mut Registry) {} } diff --git a/nativelink-scheduler/src/api_worker_scheduler.rs b/nativelink-scheduler/src/api_worker_scheduler.rs index be7c23614..621c64bf7 100644 --- a/nativelink-scheduler/src/api_worker_scheduler.rs +++ b/nativelink-scheduler/src/api_worker_scheduler.rs @@ -12,17 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashMap; +use std::ops::{Deref, DerefMut}; use std::sync::Arc; use async_lock::Mutex; use lru::LruCache; use nativelink_config::schedulers::WorkerAllocationStrategy; use nativelink_error::{error_if, make_err, make_input_err, Code, Error, ResultExt}; +use nativelink_metric::{ + group, MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, + RootMetricsComponent, +}; use nativelink_util::action_messages::{ActionInfo, ActionStage, OperationId, WorkerId}; -use nativelink_util::metrics_utils::{Collector, CollectorState, MetricsComponent, Registry}; use nativelink_util::operation_state_manager::WorkerStateManager; -use nativelink_util::platform_properties::{PlatformProperties, PlatformPropertyValue}; +use nativelink_util::platform_properties::PlatformProperties; use tokio::sync::Notify; use tonic::async_trait; use tracing::{event, Level}; @@ -31,12 +34,48 @@ use crate::platform_property_manager::PlatformPropertyManager; use crate::worker::{Worker, WorkerTimestamp, WorkerUpdate}; use crate::worker_scheduler::WorkerScheduler; +struct Workers(LruCache); + +impl Deref for Workers { + type Target = LruCache; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Workers { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +// Note: This could not be a derive macro because this derive-macro +// does not support LruCache and nameless field structs. +impl MetricsComponent for Workers { + fn publish( + &self, + _kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result { + let _enter = group!("workers").entered(); + for (worker_id, worker) in self.iter() { + let _enter = group!(worker_id).entered(); + worker.publish(MetricKind::Component, MetricFieldData::default())?; + } + Ok(MetricPublishKnownKindData::Component) + } +} + /// A collection of workers that are available to run tasks. 
+#[derive(MetricsComponent)] struct ApiWorkerSchedulerImpl { /// A `LruCache` of workers availabled based on `allocation_strategy`. - workers: LruCache, + #[metric(group = "workers")] + workers: Workers, /// The worker state manager. + #[metric(group = "worker_state_manager")] worker_state_manager: Arc, /// The allocation strategy for workers. allocation_strategy: WorkerAllocationStrategy, @@ -51,7 +90,7 @@ impl ApiWorkerSchedulerImpl { worker_id: &WorkerId, timestamp: WorkerTimestamp, ) -> Result<(), Error> { - let worker = self.workers.peek_mut(worker_id).ok_or_else(|| { + let worker = self.workers.0.peek_mut(worker_id).ok_or_else(|| { make_input_err!( "Worker not found in worker map in refresh_lifetime() {}", worker_id @@ -269,11 +308,16 @@ impl ApiWorkerSchedulerImpl { } } +#[derive(MetricsComponent)] pub struct ApiWorkerScheduler { + #[metric] inner: Mutex, + #[metric(group = "platform_property_manager")] platform_property_manager: Arc, - /// Timeout of how long to evict workers if no response in this given amount of time in seconds. + #[metric( + help = "Timeout of how long to evict workers if no response in this given amount of time in seconds." 
+ )] worker_timeout_s: u64, } @@ -287,7 +331,7 @@ impl ApiWorkerScheduler { ) -> Arc { Arc::new(Self { inner: Mutex::new(ApiWorkerSchedulerImpl { - workers: LruCache::unbounded(), + workers: Workers(LruCache::unbounded()), worker_state_manager, allocation_strategy, worker_change_notify, @@ -437,41 +481,6 @@ impl WorkerScheduler for ApiWorkerScheduler { let mut inner = self.inner.lock().await; inner.set_drain_worker(worker_id, is_draining).await } - - fn register_metrics(self: Arc, registry: &mut Registry) { - self.inner - .lock_blocking() - .worker_state_manager - .clone() - .register_metrics(registry); - registry.register_collector(Box::new(Collector::new(&self))); - } } -impl MetricsComponent for ApiWorkerScheduler { - fn gather_metrics(&self, c: &mut CollectorState) { - let inner = self.inner.lock_blocking(); - let mut props = HashMap::<&String, u64>::new(); - for (_worker_id, worker) in inner.workers.iter() { - c.publish_with_labels( - "workers", - worker, - "", - vec![("worker_id".into(), worker.id.to_string().into())], - ); - for (property, prop_value) in &worker.platform_properties.properties { - let current_value = props.get(&property).unwrap_or(&0); - if let PlatformPropertyValue::Minimum(worker_value) = prop_value { - props.insert(property, *current_value + *worker_value); - } - } - } - for (property, prop_value) in props { - c.publish( - format!("{property}_available_properties"), - &prop_value, - format!("Total sum of available properties for {property}"), - ); - } - } -} +impl RootMetricsComponent for ApiWorkerScheduler {} diff --git a/nativelink-scheduler/src/awaited_action_db/awaited_action.rs b/nativelink-scheduler/src/awaited_action_db/awaited_action.rs index eff7b3e01..0b5535e72 100644 --- a/nativelink-scheduler/src/awaited_action_db/awaited_action.rs +++ b/nativelink-scheduler/src/awaited_action_db/awaited_action.rs @@ -15,6 +15,9 @@ use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; +use nativelink_metric::{ + MetricFieldData, 
MetricKind, MetricPublishKnownKindData, MetricsComponent, +}; use nativelink_util::action_messages::{ ActionInfo, ActionStage, ActionState, OperationId, WorkerId, }; @@ -26,31 +29,49 @@ use static_assertions::{assert_eq_size, const_assert, const_assert_eq}; #[derive(Debug, Clone, Copy)] struct AwaitedActionVersion(u64); +impl MetricsComponent for AwaitedActionVersion { + fn publish( + &self, + _kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result { + Ok(MetricPublishKnownKindData::Counter(self.0)) + } +} + /// An action that is being awaited on and last known state. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, MetricsComponent)] pub struct AwaitedAction { /// The current version of the action. + #[metric(help = "The version of the AwaitedAction")] version: AwaitedActionVersion, /// The action that is being awaited on. + #[metric(help = "The action info of the AwaitedAction")] action_info: Arc, /// The operation id of the action. + #[metric(help = "The operation id of the AwaitedAction")] operation_id: OperationId, /// The currentsort key used to order the actions. + #[metric(help = "The sort key of the AwaitedAction")] sort_key: AwaitedActionSortKey, /// The time the action was last updated. + #[metric(help = "The last time the worker updated the AwaitedAction")] last_worker_updated_timestamp: SystemTime, /// Worker that is currently running this action, None if unassigned. + #[metric(help = "The worker id of the AwaitedAction")] worker_id: Option, /// The current state of the action. + #[metric(help = "The state of the AwaitedAction")] state: Arc, /// Number of attempts the job has been tried. 
+ #[metric(help = "The number of attempts the AwaitedAction has been tried")] pub attempts: usize, } @@ -135,6 +156,16 @@ impl AwaitedAction { #[repr(transparent)] pub struct AwaitedActionSortKey(u64); +impl MetricsComponent for AwaitedActionSortKey { + fn publish( + &self, + _kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result { + Ok(MetricPublishKnownKindData::Counter(self.0)) + } +} + impl AwaitedActionSortKey { #[rustfmt::skip] const fn new(priority: i32, insert_timestamp: u32) -> Self { diff --git a/nativelink-scheduler/src/awaited_action_db/mod.rs b/nativelink-scheduler/src/awaited_action_db/mod.rs index 1d3cc623d..6d5576363 100644 --- a/nativelink-scheduler/src/awaited_action_db/mod.rs +++ b/nativelink-scheduler/src/awaited_action_db/mod.rs @@ -19,8 +19,8 @@ use std::sync::Arc; pub use awaited_action::{AwaitedAction, AwaitedActionSortKey}; use futures::{Future, Stream}; use nativelink_error::Error; +use nativelink_metric::MetricsComponent; use nativelink_util::action_messages::{ActionInfo, ClientOperationId, OperationId}; -use nativelink_util::metrics_utils::MetricsComponent; mod awaited_action; @@ -34,9 +34,11 @@ pub enum SortedAwaitedActionState { } /// A struct pointing to an AwaitedAction that can be sorted. 
-#[derive(Debug, Clone)] +#[derive(Debug, Clone, MetricsComponent)] pub struct SortedAwaitedAction { + #[metric(help = "The sort key of the AwaitedAction")] pub sort_key: AwaitedActionSortKey, + #[metric(help = "The operation id")] pub operation_id: OperationId, } diff --git a/nativelink-scheduler/src/cache_lookup_scheduler.rs b/nativelink-scheduler/src/cache_lookup_scheduler.rs index 672118a09..fb09b9db4 100644 --- a/nativelink-scheduler/src/cache_lookup_scheduler.rs +++ b/nativelink-scheduler/src/cache_lookup_scheduler.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use async_trait::async_trait; use futures::Future; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_metric::{MetricsComponent, RootMetricsComponent}; use nativelink_proto::build::bazel::remote::execution::v2::{ ActionResult as ProtoActionResult, GetActionResultRequest, }; @@ -52,12 +53,15 @@ type CheckActions = HashMap< )>, >; +#[derive(MetricsComponent)] pub struct CacheLookupScheduler { /// A reference to the AC to find existing actions in. /// To prevent unintended issues, this store should probably be a CompletenessCheckingStore. + #[metric(group = "ac_store")] ac_store: Store, /// The "real" scheduler to use to perform actions if they were not found /// in the action cache. + #[metric(group = "action_scheduler")] action_scheduler: Arc, /// Actions that are currently performing a CacheCheck. inflight_cache_checks: Arc>, @@ -308,3 +312,5 @@ impl ActionScheduler for CacheLookupScheduler { .await } } + +impl RootMetricsComponent for CacheLookupScheduler {} diff --git a/nativelink-scheduler/src/default_scheduler_factory.rs b/nativelink-scheduler/src/default_scheduler_factory.rs index 304f8534f..ea6a552dc 100644 --- a/nativelink-scheduler/src/default_scheduler_factory.rs +++ b/nativelink-scheduler/src/default_scheduler_factory.rs @@ -12,13 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashSet; use std::sync::Arc; use nativelink_config::schedulers::SchedulerConfig; use nativelink_error::{Error, ResultExt}; use nativelink_store::store_manager::StoreManager; -use nativelink_util::metrics_utils::Registry; use crate::action_scheduler::ActionScheduler; use crate::cache_lookup_scheduler::CacheLookupScheduler; @@ -35,22 +33,13 @@ pub type SchedulerFactoryResults = ( pub fn scheduler_factory( scheduler_type_cfg: &SchedulerConfig, store_manager: &StoreManager, - scheduler_metrics: &mut Registry, ) -> Result { - let mut visited_schedulers = HashSet::new(); - inner_scheduler_factory( - scheduler_type_cfg, - store_manager, - Some(scheduler_metrics), - &mut visited_schedulers, - ) + inner_scheduler_factory(scheduler_type_cfg, store_manager) } fn inner_scheduler_factory( scheduler_type_cfg: &SchedulerConfig, store_manager: &StoreManager, - maybe_scheduler_metrics: Option<&mut Registry>, - visited_schedulers: &mut HashSet, ) -> Result { let scheduler: SchedulerFactoryResults = match scheduler_type_cfg { SchedulerConfig::simple(config) => { @@ -63,7 +52,7 @@ fn inner_scheduler_factory( .get_store(&config.ac_store) .err_tip(|| format!("'ac_store': '{}' does not exist", config.ac_store))?; let (action_scheduler, worker_scheduler) = - inner_scheduler_factory(&config.scheduler, store_manager, None, visited_schedulers) + inner_scheduler_factory(&config.scheduler, store_manager) .err_tip(|| "In nested CacheLookupScheduler construction")?; let cache_lookup_scheduler = Arc::new(CacheLookupScheduler::new( ac_store, @@ -73,7 +62,7 @@ fn inner_scheduler_factory( } SchedulerConfig::property_modifier(config) => { let (action_scheduler, worker_scheduler) = - inner_scheduler_factory(&config.scheduler, store_manager, None, visited_schedulers) + inner_scheduler_factory(&config.scheduler, store_manager) .err_tip(|| "In nested PropertyModifierScheduler construction")?; let property_modifier_scheduler = Arc::new(PropertyModifierScheduler::new( config, @@ 
-83,30 +72,5 @@ fn inner_scheduler_factory( } }; - if let Some(scheduler_metrics) = maybe_scheduler_metrics { - if let Some(action_scheduler) = &scheduler.0 { - // We need a way to prevent our scheduler form having `register_metrics()` called multiple times. - // This is the equivalent of grabbing a uintptr_t in C++, storing it in a set, and checking if it's - // already been visited. We can't use the Arc's pointer directly because it has two interfaces - // (ActionScheduler and WorkerScheduler) and we need to be able to know if the underlying scheduler - // has already been visited, not just the trait. `Any` could be used, but that'd require some rework - // of all the schedulers. This is the most simple way to do it. Rust's uintptr_t is usize. - let action_scheduler_uintptr: usize = - Arc::as_ptr(action_scheduler).cast::<()>() as usize; - if !visited_schedulers.contains(&action_scheduler_uintptr) { - visited_schedulers.insert(action_scheduler_uintptr); - action_scheduler.clone().register_metrics(scheduler_metrics); - } - } - if let Some(worker_scheduler) = &scheduler.1 { - let worker_scheduler_uintptr: usize = - Arc::as_ptr(worker_scheduler).cast::<()>() as usize; - if !visited_schedulers.contains(&worker_scheduler_uintptr) { - visited_schedulers.insert(worker_scheduler_uintptr); - worker_scheduler.clone().register_metrics(scheduler_metrics); - } - } - } - Ok(scheduler) } diff --git a/nativelink-scheduler/src/grpc_scheduler.rs b/nativelink-scheduler/src/grpc_scheduler.rs index 45956e139..0f24aa07b 100644 --- a/nativelink-scheduler/src/grpc_scheduler.rs +++ b/nativelink-scheduler/src/grpc_scheduler.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use futures::stream::unfold; use futures::TryFutureExt; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_metric::{MetricsComponent, RootMetricsComponent}; use nativelink_proto::build::bazel::remote::execution::v2::capabilities_client::CapabilitiesClient; use 
nativelink_proto::build::bazel::remote::execution::v2::execution_client::ExecutionClient; use nativelink_proto::build::bazel::remote::execution::v2::{ @@ -50,7 +51,9 @@ use crate::action_scheduler::{ActionListener, ActionScheduler}; use crate::default_action_listener::DefaultActionListener; use crate::platform_property_manager::PlatformPropertyManager; +#[derive(MetricsComponent)] pub struct GrpcScheduler { + #[metric(group = "property_managers")] platform_property_managers: Mutex>>, retrier: Retrier, connection_manager: ConnectionManager, @@ -318,3 +321,5 @@ impl ActionScheduler for GrpcScheduler { } } } + +impl RootMetricsComponent for GrpcScheduler {} diff --git a/nativelink-scheduler/src/memory_awaited_action_db.rs b/nativelink-scheduler/src/memory_awaited_action_db.rs index d8ddf4aa9..49cde1281 100644 --- a/nativelink-scheduler/src/memory_awaited_action_db.rs +++ b/nativelink-scheduler/src/memory_awaited_action_db.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use futures::{FutureExt, Stream}; use nativelink_config::stores::EvictionPolicy; use nativelink_error::{error_if, make_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::action_messages::{ ActionInfo, ActionStage, ActionState, ActionUniqueKey, ActionUniqueQualifier, ClientOperationId, OperationId, @@ -29,7 +30,6 @@ use nativelink_util::action_messages::{ use nativelink_util::chunked_stream::ChunkedStream; use nativelink_util::evicting_map::{EvictingMap, LenEntry}; use nativelink_util::instant_wrapper::InstantWrapper; -use nativelink_util::metrics_utils::{CollectorState, MetricsComponent}; use nativelink_util::operation_state_manager::ActionStateResult; use nativelink_util::spawn; use nativelink_util::task::JoinHandleDropGuard; @@ -268,12 +268,17 @@ impl ActionStateResult for ClientActionStateResult, + #[metric(group = "cache_check")] cache_check: BTreeSet, + #[metric(group = "queued")] queued: BTreeSet, + #[metric(group = "executing")] executing: BTreeSet, + 
#[metric(group = "completed")] completed: BTreeSet, } @@ -342,11 +347,14 @@ impl SortedAwaitedActions { } /// The database for storing the state of all actions. +#[derive(MetricsComponent)] pub struct AwaitedActionDbImpl I> { /// A lookup table to lookup the state of an action by its client operation id. + #[metric(group = "client_operation_ids")] client_operation_to_awaited_action: EvictingMap, I>, /// A lookup table to lookup the state of an action by its worker operation id. + #[metric(group = "operation_ids")] operation_id_to_awaited_action: BTreeMap>, /// A lookup table to lookup the state of an action by its unique qualifier. @@ -356,9 +364,11 @@ pub struct AwaitedActionDbImpl I> { /// based on the [`AwaitedActionSortKey`] of the [`AwaitedAction`]. /// /// See [`AwaitedActionSortKey`] for more information on the ordering. + #[metric(group = "sorted_action_infos")] sorted_action_info_hash_keys: SortedAwaitedActions, /// The number of connected clients for each operation id. + #[metric(group = "connected_clients_for_operation_id")] connected_clients_for_operation_id: HashMap, /// Where to send notifications about important events related to actions. 
@@ -833,7 +843,9 @@ impl I + Clone + Send + Sync> AwaitedActionDbI } } +#[derive(MetricsComponent)] pub struct MemoryAwaitedActionDb I> { + #[metric] inner: Arc>>, _handle_awaited_action_events: JoinHandleDropGuard<()>, } @@ -978,48 +990,3 @@ impl I + Clone + Send + Sync + 'static> Awaite .await } } - -impl I + Send + Sync + 'static> MetricsComponent - for MemoryAwaitedActionDb -{ - fn gather_metrics(&self, c: &mut CollectorState) { - let inner = self.inner.lock_blocking(); - c.publish( - "action_state_unknown_total", - &inner.sorted_action_info_hash_keys.unknown.len(), - "Number of actions wih the current state of unknown.", - ); - c.publish( - "action_state_cache_check_total", - &inner.sorted_action_info_hash_keys.cache_check.len(), - "Number of actions wih the current state of cache_check.", - ); - c.publish( - "action_state_queued_total", - &inner.sorted_action_info_hash_keys.queued.len(), - "Number of actions wih the current state of queued.", - ); - c.publish( - "action_state_executing_total", - &inner.sorted_action_info_hash_keys.executing.len(), - "Number of actions wih the current state of executing.", - ); - c.publish( - "action_state_completed_total", - &inner.sorted_action_info_hash_keys.completed.len(), - "Number of actions wih the current state of completed.", - ); - // TODO(allada) This is legacy and should be removed in the future. - c.publish( - "active_actions_total", - &inner.sorted_action_info_hash_keys.executing.len(), - "(LEGACY) The number of running actions.", - ); - // TODO(allada) This is legacy and should be removed in the future. 
- c.publish( - "queued_actions_total", - &inner.sorted_action_info_hash_keys.queued.len(), - "(LEGACY) The number actions in the queue.", - ); - } -} diff --git a/nativelink-scheduler/src/platform_property_manager.rs b/nativelink-scheduler/src/platform_property_manager.rs index ba3d41ca4..20203483c 100644 --- a/nativelink-scheduler/src/platform_property_manager.rs +++ b/nativelink-scheduler/src/platform_property_manager.rs @@ -16,6 +16,9 @@ use std::collections::HashMap; use nativelink_config::schedulers::PropertyType; use nativelink_error::{make_input_err, Code, Error, ResultExt}; +use nativelink_metric::{ + group, MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, +}; use nativelink_util::platform_properties::PlatformPropertyValue; /// Helps manage known properties and conversion into `PlatformPropertyValue`. @@ -23,6 +26,25 @@ pub struct PlatformPropertyManager { known_properties: HashMap, } +// TODO(allada) We cannot use the `MetricsComponent` trait here because +// the `PropertyType` lives in the `nativelink-config` crate which is not +// a dependency of the `nativelink-metric-collector` crate. 
+impl MetricsComponent for PlatformPropertyManager { + fn publish( + &self, + _kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let _enter = group!("known_properties").entered(); + for (k, v) in &self.known_properties { + group!(k).in_scope(|| { + format!("{v:?}").publish(MetricKind::String, field_metadata.clone()) + })?; + } + Ok(MetricPublishKnownKindData::Component) + } +} + impl PlatformPropertyManager { #[must_use] pub const fn new(known_properties: HashMap) -> Self { diff --git a/nativelink-scheduler/src/property_modifier_scheduler.rs b/nativelink-scheduler/src/property_modifier_scheduler.rs index 77f3b897d..723d4d1de 100644 --- a/nativelink-scheduler/src/property_modifier_scheduler.rs +++ b/nativelink-scheduler/src/property_modifier_scheduler.rs @@ -20,16 +20,19 @@ use std::sync::Arc; use async_trait::async_trait; use nativelink_config::schedulers::{PropertyModification, PropertyType}; use nativelink_error::{Error, ResultExt}; +use nativelink_metric::{MetricsComponent, RootMetricsComponent}; use nativelink_util::action_messages::{ActionInfo, ClientOperationId}; -use nativelink_util::metrics_utils::Registry; use parking_lot::Mutex; use crate::action_scheduler::{ActionListener, ActionScheduler}; use crate::platform_property_manager::PlatformPropertyManager; +#[derive(MetricsComponent)] pub struct PropertyModifierScheduler { modifications: Vec, + #[metric(group = "scheduler")] scheduler: Arc, + #[metric(group = "property_manager")] property_managers: Mutex>>, } @@ -125,10 +128,6 @@ impl ActionScheduler for PropertyModifierScheduler { .find_by_client_operation_id(client_operation_id) .await } - - // Register metrics for the underlying ActionScheduler. 
- fn register_metrics(self: Arc, registry: &mut Registry) { - let scheduler_registry = registry.sub_registry_with_prefix("property_modifier"); - self.scheduler.clone().register_metrics(scheduler_registry); - } } + +impl RootMetricsComponent for PropertyModifierScheduler {} diff --git a/nativelink-scheduler/src/simple_scheduler.rs b/nativelink-scheduler/src/simple_scheduler.rs index d0f30ae44..3eb198311 100644 --- a/nativelink-scheduler/src/simple_scheduler.rs +++ b/nativelink-scheduler/src/simple_scheduler.rs @@ -20,11 +20,11 @@ use async_trait::async_trait; use futures::Future; use nativelink_config::stores::EvictionPolicy; use nativelink_error::{Error, ResultExt}; +use nativelink_metric::{MetricsComponent, RootMetricsComponent}; use nativelink_util::action_messages::{ ActionInfo, ActionStage, ActionState, ClientOperationId, OperationId, WorkerId, }; use nativelink_util::instant_wrapper::InstantWrapper; -use nativelink_util::metrics_utils::Registry; use nativelink_util::operation_state_manager::{ ActionStateResult, ActionStateResultStream, ClientStateManager, MatchingEngineStateManager, OperationFilter, OperationStageFlags, OrderDirection, @@ -95,14 +95,17 @@ impl ActionListener for SimpleSchedulerActionListener { /// Engine used to manage the queued/running tasks and relationship with /// the worker nodes. All state on how the workers and actions are interacting /// should be held in this struct. +#[derive(MetricsComponent)] pub struct SimpleScheduler { /// Manager for matching engine side of the state manager. matching_engine_state_manager: Arc, /// Manager for client state of this scheduler. + #[metric(group = "client_state_manager")] client_state_manager: Arc, /// Manager for platform of this scheduler. 
+ #[metric(group = "platform_properties")] platform_property_manager: Arc, /// A `Workers` pool that contains all workers that are available to execute actions in a priority @@ -382,13 +385,6 @@ impl ActionScheduler for SimpleScheduler { })?; Ok(maybe_receiver) } - - fn register_metrics(self: Arc, registry: &mut Registry) { - self.client_state_manager.clone().register_metrics(registry); - self.matching_engine_state_manager - .clone() - .register_metrics(registry); - } } #[async_trait] @@ -438,3 +434,5 @@ impl WorkerScheduler for SimpleScheduler { .await } } + +impl RootMetricsComponent for SimpleScheduler {} diff --git a/nativelink-scheduler/src/simple_scheduler_state_manager.rs b/nativelink-scheduler/src/simple_scheduler_state_manager.rs index f8bd2fc4d..60f7d3882 100644 --- a/nativelink-scheduler/src/simple_scheduler_state_manager.rs +++ b/nativelink-scheduler/src/simple_scheduler_state_manager.rs @@ -18,11 +18,11 @@ use std::sync::Arc; use async_trait::async_trait; use futures::{future, stream, StreamExt, TryStreamExt}; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::action_messages::{ ActionInfo, ActionResult, ActionStage, ActionState, ActionUniqueQualifier, ClientOperationId, ExecutionMetadata, OperationId, WorkerId, }; -use nativelink_util::metrics_utils::{Collector, CollectorState, MetricsComponent, Registry}; use nativelink_util::operation_state_manager::{ ActionStateResult, ActionStateResultStream, ClientStateManager, MatchingEngineStateManager, OperationFilter, OperationStageFlags, OrderDirection, WorkerStateManager, @@ -126,8 +126,10 @@ fn apply_filter_predicate(awaited_action: &AwaitedAction, filter: &OperationFilt /// Scheduler state includes the actions that are queued, active, and recently completed. /// It also includes the workers that are available to execute actions based on allocation /// strategy. 
+#[derive(MetricsComponent)] pub struct SimpleSchedulerStateManager { /// Database for storing the state of all actions. + #[metric(group = "action_db")] action_db: T, /// Notify matching engine that work needs to be done. @@ -136,6 +138,7 @@ pub struct SimpleSchedulerStateManager { /// Maximum number of times a job can be retried. // TODO(allada) This should be a scheduler decorator instead // of always having it on every SimpleScheduler. + #[metric(help = "Maximum number of times a job can be retried")] max_job_retries: usize, } @@ -464,17 +467,4 @@ impl MatchingEngineStateManager for SimpleSchedulerStateMana self.inner_update_operation(operation_id, maybe_worker_id, stage_result) .await } - - /// Register metrics with the registry. - fn register_metrics(self: Arc, registry: &mut Registry) { - // TODO(allada) We only register the metrics in one of the components instead of - // all three because it's a bit tricky to separate the metrics for each component. - registry.register_collector(Box::new(Collector::new(&self))); - } -} - -impl MetricsComponent for SimpleSchedulerStateManager { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish("", &self.action_db, ""); - } } diff --git a/nativelink-scheduler/src/worker.rs b/nativelink-scheduler/src/worker.rs index 883c7c1a9..4f2b5a78a 100644 --- a/nativelink-scheduler/src/worker.rs +++ b/nativelink-scheduler/src/worker.rs @@ -18,13 +18,12 @@ use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ update_for_worker, ConnectionResult, StartExecute, UpdateForWorker, }; use nativelink_util::action_messages::{ActionInfo, OperationId, WorkerId}; -use nativelink_util::metrics_utils::{ - CollectorState, CounterWithTime, FuncCounterWrapper, MetricsComponent, -}; +use nativelink_util::metrics_utils::{CounterWithTime, 
FuncCounterWrapper}; use nativelink_util::platform_properties::{PlatformProperties, PlatformPropertyValue}; use tokio::sync::mpsc::UnboundedSender; @@ -41,8 +40,10 @@ pub enum WorkerUpdate { /// Represents a connection to a worker and used as the medium to /// interact with the worker from the client/scheduler. +#[derive(MetricsComponent)] pub struct Worker { /// Unique identifier of the worker. + #[metric(help = "The unique identifier of the worker.")] pub id: WorkerId, /// Properties that describe the capabilities of this worker. @@ -57,15 +58,19 @@ pub struct Worker { /// Timestamp of last time this worker had been communicated with. // Warning: Do not update this timestamp without updating the placement of the worker in // the LRUCache in the Workers struct. + #[metric(help = "Last time this worker was communicated with.")] pub last_update_timestamp: WorkerTimestamp, /// Whether the worker rejected the last action due to back pressure. + #[metric(help = "If the worker is paused.")] pub is_paused: bool, /// Whether the worker is draining. + #[metric(help = "If the worker is draining.")] pub is_draining: bool, /// Stats about the worker. 
+ #[metric] metrics: Arc, } @@ -235,109 +240,16 @@ impl Hash for Worker { } } -#[derive(Default)] +#[derive(Default, MetricsComponent)] struct Metrics { + #[metric(help = "The timestamp of when this worker connected.")] connected_timestamp: u64, + #[metric(help = "The number of actions completed for this worker.")] actions_completed: CounterWithTime, + #[metric(help = "The number of actions started for this worker.")] run_action: FuncCounterWrapper, + #[metric(help = "The number of keep_alive sent to this worker.")] keep_alive: FuncCounterWrapper, + #[metric(help = "The number of notify_disconnect sent to this worker.")] notify_disconnect: CounterWithTime, } - -impl MetricsComponent for Worker { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish_with_labels( - "connected_timestamp", - &self.metrics.connected_timestamp, - "The timestamp of when this worker connected.", - vec![("worker_id".into(), format!("{}", self.id).into())], - ); - c.publish_with_labels( - "actions_completed", - &self.metrics.actions_completed, - "The number of actions completed for this worker.", - vec![("worker_id".into(), format!("{}", self.id).into())], - ); - c.publish_with_labels( - "run_action", - &self.metrics.run_action, - "The number of actions started for this worker.", - vec![("worker_id".into(), format!("{}", self.id).into())], - ); - c.publish_with_labels( - "keep_alive", - &self.metrics.keep_alive, - "The number of keep_alive sent to this worker.", - vec![("worker_id".into(), format!("{}", self.id).into())], - ); - c.publish_with_labels( - "notify_disconnect", - &self.metrics.notify_disconnect, - "The number of notify_disconnect sent to this worker.", - vec![("worker_id".into(), format!("{}", self.id).into())], - ); - - // Publish info about current state of worker. 
- c.publish_with_labels( - "is_paused", - &self.is_paused, - "If this worker is paused.", - vec![("worker_id".into(), format!("{}", self.id).into())], - ); - c.publish_with_labels( - "is_draining", - &self.is_draining, - "If this worker is draining.", - vec![("worker_id".into(), format!("{}", self.id).into())], - ); - for action_info in self.running_action_infos.values() { - let action_name = action_info.unique_qualifier.to_string(); - c.publish_with_labels( - "timeout", - &action_info.timeout, - "Timeout of the running action.", - vec![("digest".into(), action_name.clone().into())], - ); - c.publish_with_labels( - "priority", - &action_info.priority, - "Priority of the running action.", - vec![("digest".into(), action_name.clone().into())], - ); - c.publish_with_labels( - "load_timestamp", - &action_info.load_timestamp, - "When this action started to be loaded from the CAS.", - vec![("digest".into(), action_name.clone().into())], - ); - c.publish_with_labels( - "insert_timestamp", - &action_info.insert_timestamp, - "When this action was created.", - vec![("digest".into(), action_name.clone().into())], - ); - } - for (prop_name, prop_type_and_value) in &self.platform_properties.properties { - match prop_type_and_value { - PlatformPropertyValue::Exact(value) - | PlatformPropertyValue::Priority(value) - | PlatformPropertyValue::Unknown(value) => { - c.publish_with_labels( - "platform_properties", - value, - "The platform properties state.", - vec![("property_name".into(), prop_name.to_string().into())], - ); - } - PlatformPropertyValue::Minimum(value) => { - c.publish_with_labels( - "platform_properties", - value, - "The platform properties state.", - vec![("property_name".into(), prop_name.to_string().into())], - ); - } - }; - } - } -} diff --git a/nativelink-scheduler/src/worker_scheduler.rs b/nativelink-scheduler/src/worker_scheduler.rs index 10298c69c..e1acd8c80 100644 --- a/nativelink-scheduler/src/worker_scheduler.rs +++ 
b/nativelink-scheduler/src/worker_scheduler.rs @@ -12,12 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; - use async_trait::async_trait; use nativelink_error::Error; +use nativelink_metric::RootMetricsComponent; use nativelink_util::action_messages::{ActionStage, OperationId, WorkerId}; -use nativelink_util::metrics_utils::Registry; use crate::platform_property_manager::PlatformPropertyManager; use crate::worker::{Worker, WorkerTimestamp}; @@ -25,7 +23,7 @@ use crate::worker::{Worker, WorkerTimestamp}; /// WorkerScheduler interface is responsible for interactions between the scheduler /// and worker related operations. #[async_trait] -pub trait WorkerScheduler: Sync + Send + Unpin { +pub trait WorkerScheduler: Sync + Send + Unpin + RootMetricsComponent + 'static { /// Returns the platform property manager. fn get_platform_property_manager(&self) -> &PlatformPropertyManager; @@ -56,7 +54,4 @@ pub trait WorkerScheduler: Sync + Send + Unpin { /// Sets if the worker is draining or not. async fn set_drain_worker(&self, worker_id: &WorkerId, is_draining: bool) -> Result<(), Error>; - - /// Register the metrics for the worker scheduler. 
- fn register_metrics(self: Arc, _registry: &mut Registry) {} } diff --git a/nativelink-scheduler/tests/utils/mock_scheduler.rs b/nativelink-scheduler/tests/utils/mock_scheduler.rs index bf4362cc5..f878a79f2 100644 --- a/nativelink-scheduler/tests/utils/mock_scheduler.rs +++ b/nativelink-scheduler/tests/utils/mock_scheduler.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use async_trait::async_trait; use nativelink_error::{make_input_err, Error}; +use nativelink_metric::{MetricsComponent, RootMetricsComponent}; use nativelink_scheduler::action_scheduler::{ActionListener, ActionScheduler}; use nativelink_scheduler::platform_property_manager::PlatformPropertyManager; use nativelink_util::action_messages::{ActionInfo, ClientOperationId}; @@ -35,6 +36,7 @@ enum ActionSchedulerReturns { FindExistingAction(Result>>, Error>), } +#[derive(MetricsComponent)] pub struct MockActionScheduler { rx_call: Mutex>, tx_call: mpsc::UnboundedSender, @@ -183,3 +185,5 @@ impl ActionScheduler for MockActionScheduler { } } } + +impl RootMetricsComponent for MockActionScheduler {} diff --git a/nativelink-service/BUILD.bazel b/nativelink-service/BUILD.bazel index f8f47072f..d44af8cbb 100644 --- a/nativelink-service/BUILD.bazel +++ b/nativelink-service/BUILD.bazel @@ -23,6 +23,7 @@ rust_library( deps = [ "//nativelink-config", "//nativelink-error", + "//nativelink-metric", "//nativelink-proto", "//nativelink-scheduler", "//nativelink-store", @@ -60,6 +61,7 @@ rust_test_suite( deps = [ "//nativelink-config", "//nativelink-error", + "//nativelink-metric", "//nativelink-proto", "//nativelink-scheduler", "//nativelink-service", @@ -71,7 +73,6 @@ rust_test_suite( "@crates//:hyper", "@crates//:maplit", "@crates//:pretty_assertions", - "@crates//:prometheus-client", "@crates//:prost", "@crates//:prost-types", "@crates//:serde_json5", diff --git a/nativelink-service/Cargo.toml b/nativelink-service/Cargo.toml index 983d0f5d7..9c9f15061 100644 --- a/nativelink-service/Cargo.toml +++ 
b/nativelink-service/Cargo.toml @@ -27,11 +27,11 @@ uuid = { version = "1.8.0", features = ["v4"] } [dev-dependencies] nativelink-macro = { path = "../nativelink-macro" } +nativelink-metric = { path = "../nativelink-metric" } async-trait = "0.1.80" async-lock = "3.3.0" hyper = "0.14.28" maplit = "1.0.2" pretty_assertions = "1.4.0" -prometheus-client = "0.21.2" prost-types = "0.12.4" diff --git a/nativelink-service/tests/ac_server_test.rs b/nativelink-service/tests/ac_server_test.rs index 2a6be0fb4..ba1362a88 100644 --- a/nativelink-service/tests/ac_server_test.rs +++ b/nativelink-service/tests/ac_server_test.rs @@ -29,7 +29,6 @@ use nativelink_store::store_manager::StoreManager; use nativelink_util::common::DigestInfo; use nativelink_util::store_trait::StoreLike; use pretty_assertions::assert_eq; -use prometheus_client::registry::Registry; use prost::Message; use tonic::{Code, Request, Response, Status}; @@ -60,7 +59,6 @@ async fn make_store_manager() -> Result, Error> { nativelink_config::stores::MemoryStore::default(), ), &store_manager, - Some(&mut ::default()), None, ) .await?, @@ -72,7 +70,6 @@ async fn make_store_manager() -> Result, Error> { nativelink_config::stores::MemoryStore::default(), ), &store_manager, - Some(&mut ::default()), None, ) .await?, diff --git a/nativelink-service/tests/bep_server_test.rs b/nativelink-service/tests/bep_server_test.rs index 11c582ac7..27594d5b4 100644 --- a/nativelink-service/tests/bep_server_test.rs +++ b/nativelink-service/tests/bep_server_test.rs @@ -39,7 +39,6 @@ use nativelink_util::buf_channel::make_buf_channel_pair; use nativelink_util::common::encode_stream_proto; use nativelink_util::store_trait::{Store, StoreKey, StoreLike}; use pretty_assertions::assert_eq; -use prometheus_client::registry::Registry; use prost::Message; use prost_types::Timestamp; use tonic::codec::{Codec, ProstCodec}; @@ -57,7 +56,6 @@ async fn make_store_manager() -> Result, Error> { nativelink_config::stores::MemoryStore::default(), ), 
&store_manager, - Some(&mut ::default()), None, ) .await?, diff --git a/nativelink-service/tests/bytestream_server_test.rs b/nativelink-service/tests/bytestream_server_test.rs index 81c1c4060..2a29c8929 100644 --- a/nativelink-service/tests/bytestream_server_test.rs +++ b/nativelink-service/tests/bytestream_server_test.rs @@ -37,7 +37,6 @@ use nativelink_util::store_trait::StoreLike; use nativelink_util::task::JoinHandleDropGuard; use nativelink_util::{background_spawn, spawn}; use pretty_assertions::assert_eq; -use prometheus_client::registry::Registry; use tokio::io::DuplexStream; use tokio::sync::mpsc::unbounded_channel; use tokio::task::yield_now; @@ -60,7 +59,6 @@ async fn make_store_manager() -> Result, Error> { nativelink_config::stores::MemoryStore::default(), ), &store_manager, - Some(&mut ::default()), None, ) .await?, diff --git a/nativelink-service/tests/cas_server_test.rs b/nativelink-service/tests/cas_server_test.rs index 7805c7be4..65e0219af 100644 --- a/nativelink-service/tests/cas_server_test.rs +++ b/nativelink-service/tests/cas_server_test.rs @@ -35,7 +35,6 @@ use nativelink_util::common::DigestInfo; use nativelink_util::digest_hasher::DigestHasherFunc; use nativelink_util::store_trait::{StoreKey, StoreLike}; use pretty_assertions::assert_eq; -use prometheus_client::registry::Registry; use prost_types::Timestamp; use tonic::{Code, Request}; @@ -54,7 +53,6 @@ async fn make_store_manager() -> Result, Error> { nativelink_config::stores::MemoryStore::default(), ), &store_manager, - Some(&mut ::default()), None, ) .await?, diff --git a/nativelink-service/tests/worker_api_server_test.rs b/nativelink-service/tests/worker_api_server_test.rs index 31df1d287..fc472765f 100644 --- a/nativelink-service/tests/worker_api_server_test.rs +++ b/nativelink-service/tests/worker_api_server_test.rs @@ -22,6 +22,7 @@ use nativelink_config::cas_server::WorkerApiConfig; use nativelink_config::schedulers::WorkerAllocationStrategy; use nativelink_error::{Error, 
ResultExt}; use nativelink_macro::nativelink_test; +use nativelink_metric::MetricsComponent; use nativelink_proto::build::bazel::remote::execution::v2::{ ActionResult as ProtoActionResult, ExecuteResponse, ExecutedActionMetadata, LogFile, OutputDirectory, OutputFile, OutputSymlink, @@ -61,6 +62,7 @@ enum WorkerStateManagerReturns { UpdateOperation(Result<(), Error>), } +#[derive(MetricsComponent)] struct MockWorkerStateManager { rx_call: Arc>>, tx_call: mpsc::UnboundedSender, diff --git a/nativelink-store/BUILD.bazel b/nativelink-store/BUILD.bazel index 6007642a4..47a58766d 100644 --- a/nativelink-store/BUILD.bazel +++ b/nativelink-store/BUILD.bazel @@ -37,9 +37,9 @@ rust_library( deps = [ "//nativelink-config", "//nativelink-error", + "//nativelink-metric", "//nativelink-proto", "//nativelink-util", - "@crates//:arc-cell", "@crates//:async-lock", "@crates//:aws-config", "@crates//:aws-sdk-s3", @@ -100,6 +100,7 @@ rust_test_suite( ":nativelink-store", "//nativelink-config", "//nativelink-error", + "//nativelink-metric", "//nativelink-proto", "//nativelink-util", "@crates//:async-lock", diff --git a/nativelink-store/Cargo.toml b/nativelink-store/Cargo.toml index aa3d55d05..9571f2317 100644 --- a/nativelink-store/Cargo.toml +++ b/nativelink-store/Cargo.toml @@ -8,8 +8,8 @@ nativelink-error = { path = "../nativelink-error" } nativelink-config = { path = "../nativelink-config" } nativelink-util = { path = "../nativelink-util" } nativelink-proto = { path = "../nativelink-proto" } +nativelink-metric = { path = "../nativelink-metric" } -arc-cell = "0.3.3" async-lock = "3.3.0" async-trait = "0.1.80" aws-config = "1.4.0" diff --git a/nativelink-store/src/completeness_checking_store.rs b/nativelink-store/src/completeness_checking_store.rs index 898e1cdaf..de6343d6f 100644 --- a/nativelink-store/src/completeness_checking_store.rs +++ b/nativelink-store/src/completeness_checking_store.rs @@ -20,15 +20,14 @@ use async_trait::async_trait; use futures::stream::{FuturesUnordered, 
StreamExt}; use futures::{select, FutureExt, TryFutureExt}; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_proto::build::bazel::remote::execution::v2::{ ActionResult as ProtoActionResult, OutputDirectory as ProtoOutputDirectory, Tree as ProtoTree, }; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::common::DigestInfo; use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator}; -use nativelink_util::metrics_utils::{ - Collector, CollectorState, CounterWithTime, MetricsComponent, Registry, -}; +use nativelink_util::metrics_utils::CounterWithTime; use nativelink_util::store_trait::{Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo}; use parking_lot::Mutex; use tokio::sync::Notify; @@ -105,11 +104,14 @@ async fn check_output_directories<'a>( Ok(()) } +#[derive(MetricsComponent)] pub struct CompletenessCheckingStore { cas_store: Store, ac_store: Store, + #[metric(help = "Incomplete entries hit in CompletenessCheckingStore")] incomplete_entries_counter: CounterWithTime, + #[metric(help = "Complete entries hit in CompletenessCheckingStore")] complete_entries_counter: CounterWithTime, } @@ -385,29 +387,6 @@ impl StoreDriver for CompletenessCheckingStore { fn as_any_arc(self: Arc) -> Arc { self } - - fn register_metrics(self: Arc, registry: &mut Registry) { - self.cas_store - .register_metrics(registry.sub_registry_with_prefix("cas_store")); - self.ac_store - .register_metrics(registry.sub_registry_with_prefix("ac_store")); - registry.register_collector(Box::new(Collector::new(&self))); - } -} - -impl MetricsComponent for CompletenessCheckingStore { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish( - "incomplete_entries_counter", - &self.incomplete_entries_counter, - "Incomplete entries hit in CompletenessCheckingStore", - ); - c.publish( - "complete_entries_counter", - &self.complete_entries_counter, - 
"Complete entries hit in CompletenessCheckingStore", - ); - } } default_health_status_indicator!(CompletenessCheckingStore); diff --git a/nativelink-store/src/compression_store.rs b/nativelink-store/src/compression_store.rs index 22de23f2e..fb3884015 100644 --- a/nativelink-store/src/compression_store.rs +++ b/nativelink-store/src/compression_store.rs @@ -24,11 +24,11 @@ use bytes::{Buf, BufMut, BytesMut}; use futures::future::FutureExt; use lz4_flex::block::{compress_into, decompress_into, get_maximum_output_size}; use nativelink_error::{error_if, make_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{ make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf, }; use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator}; -use nativelink_util::metrics_utils::Registry; use nativelink_util::spawn; use nativelink_util::store_trait::{Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo}; use serde::{Deserialize, Serialize}; @@ -209,7 +209,9 @@ impl UploadState { /// Note: Currently using get_part() and trying to read part of the data will /// result in the entire contents being read from the inner store but will /// only send the contents requested. 
+#[derive(MetricsComponent)] pub struct CompressionStore { + #[metric(group = "inner_store")] inner_store: Store, config: nativelink_config::stores::Lz4Config, bincode_options: BincodeOptions, @@ -623,11 +625,6 @@ impl StoreDriver for CompressionStore { fn as_any_arc(self: Arc) -> Arc { self } - - fn register_metrics(self: Arc, registry: &mut Registry) { - let inner_store_registry = registry.sub_registry_with_prefix("inner_store"); - self.inner_store.register_metrics(inner_store_registry); - } } default_health_status_indicator!(CompressionStore); diff --git a/nativelink-store/src/dedup_store.rs b/nativelink-store/src/dedup_store.rs index 63a992e17..e0c79905e 100644 --- a/nativelink-store/src/dedup_store.rs +++ b/nativelink-store/src/dedup_store.rs @@ -21,6 +21,7 @@ use bincode::config::{FixintEncoding, WithOtherIntEncoding}; use bincode::{DefaultOptions, Options}; use futures::stream::{self, FuturesOrdered, StreamExt, TryStreamExt}; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::common::DigestInfo; use nativelink_util::fastcdc::FastCDC; @@ -43,10 +44,14 @@ pub struct DedupIndex { pub entries: Vec, } +#[derive(MetricsComponent)] pub struct DedupStore { + #[metric(group = "index_store")] index_store: Store, + #[metric(group = "content_store")] content_store: Store, fast_cdc_decoder: FastCDC, + #[metric(help = "Maximum number of concurrent fetches per get")] max_concurrent_fetch_per_get: usize, bincode_options: WithOtherIntEncoding, } diff --git a/nativelink-store/src/default_store_factory.rs b/nativelink-store/src/default_store_factory.rs index 336393121..f5fc767c5 100644 --- a/nativelink-store/src/default_store_factory.rs +++ b/nativelink-store/src/default_store_factory.rs @@ -20,7 +20,6 @@ use futures::{Future, TryStreamExt}; use nativelink_config::stores::StoreConfig; use nativelink_error::Error; use 
nativelink_util::health_utils::HealthRegistryBuilder; -use nativelink_util::metrics_utils::Registry; use nativelink_util::store_trait::{Store, StoreDriver}; use crate::completeness_checking_store::CompletenessCheckingStore; @@ -45,7 +44,6 @@ type FutureMaybeStore<'a> = Box> + 'a>; pub fn store_factory<'a>( backend: &'a StoreConfig, store_manager: &'a Arc, - maybe_store_metrics: Option<&'a mut Registry>, maybe_health_registry_builder: Option<&'a mut HealthRegistryBuilder>, ) -> Pin> { Box::pin(async move { @@ -55,36 +53,36 @@ pub fn store_factory<'a>( StoreConfig::redis_store(config) => RedisStore::new(config)?, StoreConfig::verify(config) => VerifyStore::new( config, - store_factory(&config.backend, store_manager, None, None).await?, + store_factory(&config.backend, store_manager, None).await?, ), StoreConfig::compression(config) => CompressionStore::new( *config.clone(), - store_factory(&config.backend, store_manager, None, None).await?, + store_factory(&config.backend, store_manager, None).await?, )?, StoreConfig::dedup(config) => DedupStore::new( config, - store_factory(&config.index_store, store_manager, None, None).await?, - store_factory(&config.content_store, store_manager, None, None).await?, + store_factory(&config.index_store, store_manager, None).await?, + store_factory(&config.content_store, store_manager, None).await?, ), StoreConfig::existence_cache(config) => ExistenceCacheStore::new( config, - store_factory(&config.backend, store_manager, None, None).await?, + store_factory(&config.backend, store_manager, None).await?, ), StoreConfig::completeness_checking(config) => CompletenessCheckingStore::new( - store_factory(&config.backend, store_manager, None, None).await?, - store_factory(&config.cas_store, store_manager, None, None).await?, + store_factory(&config.backend, store_manager, None).await?, + store_factory(&config.cas_store, store_manager, None).await?, ), StoreConfig::fast_slow(config) => FastSlowStore::new( config, - 
store_factory(&config.fast, store_manager, None, None).await?, - store_factory(&config.slow, store_manager, None, None).await?, + store_factory(&config.fast, store_manager, None).await?, + store_factory(&config.slow, store_manager, None).await?, ), StoreConfig::filesystem(config) => ::new(config).await?, StoreConfig::ref_store(config) => RefStore::new(config, Arc::downgrade(store_manager)), StoreConfig::size_partitioning(config) => SizePartitioningStore::new( config, - store_factory(&config.lower_store, store_manager, None, None).await?, - store_factory(&config.upper_store, store_manager, None, None).await?, + store_factory(&config.lower_store, store_manager, None).await?, + store_factory(&config.upper_store, store_manager, None).await?, ), StoreConfig::grpc(config) => GrpcStore::new(config).await?, StoreConfig::noop => NoopStore::new(), @@ -92,18 +90,13 @@ pub fn store_factory<'a>( let stores = config .stores .iter() - .map(|store_config| { - store_factory(&store_config.store, store_manager, None, None) - }) + .map(|store_config| store_factory(&store_config.store, store_manager, None)) .collect::>() .try_collect::>() .await?; ShardStore::new(config, stores)? 
} }; - if let Some(store_metrics) = maybe_store_metrics { - store.clone().register_metrics(store_metrics); - } if let Some(health_registry_builder) = maybe_health_registry_builder { store.clone().register_health(health_registry_builder); diff --git a/nativelink-store/src/existence_cache_store.rs b/nativelink-store/src/existence_cache_store.rs index a39791a3f..ccfcde079 100644 --- a/nativelink-store/src/existence_cache_store.rs +++ b/nativelink-store/src/existence_cache_store.rs @@ -19,11 +19,11 @@ use std::time::SystemTime; use async_trait::async_trait; use nativelink_config::stores::{EvictionPolicy, ExistenceCacheStore as ExistenceCacheStoreConfig}; use nativelink_error::{error_if, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::common::DigestInfo; use nativelink_util::evicting_map::{EvictingMap, LenEntry}; use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator}; -use nativelink_util::metrics_utils::{CollectorState, MetricsComponent, Registry}; use nativelink_util::store_trait::{Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo}; #[derive(Clone, Debug)] @@ -41,7 +41,9 @@ impl LenEntry for ExistanceItem { } } +#[derive(MetricsComponent)] pub struct ExistenceCacheStore { + #[metric(group = "inner_store")] inner_store: Store, existence_cache: EvictingMap, } @@ -210,17 +212,6 @@ impl StoreDriver for ExistenceCacheStore { fn as_any_arc(self: Arc) -> Arc { self } - - fn register_metrics(self: Arc, registry: &mut Registry) { - let inner_store_registry = registry.sub_registry_with_prefix("inner_store"); - self.inner_store.register_metrics(inner_store_registry); - } -} - -impl MetricsComponent for ExistenceCacheStore { - fn gather_metrics(&self, c: &mut CollectorState) { - self.existence_cache.gather_metrics(c) - } } default_health_status_indicator!(ExistenceCacheStore); diff --git a/nativelink-store/src/fast_slow_store.rs 
b/nativelink-store/src/fast_slow_store.rs index dc4962d6b..3ce160e36 100644 --- a/nativelink-store/src/fast_slow_store.rs +++ b/nativelink-store/src/fast_slow_store.rs @@ -22,12 +22,12 @@ use std::sync::{Arc, Weak}; use async_trait::async_trait; use futures::{join, FutureExt}; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{ make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf, }; use nativelink_util::fs; use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator}; -use nativelink_util::metrics_utils::{CollectorState, MetricsComponent, Registry}; use nativelink_util::store_trait::{ slow_update_store_with_file, Store, StoreDriver, StoreKey, StoreLike, StoreOptimizations, UploadSizeInfo, @@ -40,10 +40,14 @@ use nativelink_util::store_trait::{ // client to hang up while the data is buffered. An alternative is to possibly make a // "BufferedStore" that could be placed on the "slow" store that would hang up early // if data is in the buffer. 
+#[derive(MetricsComponent)] pub struct FastSlowStore { + #[metric(group = "fast_store")] fast_store: Store, + #[metric(group = "slow_store")] slow_store: Store, weak_self: Weak, + #[metric] metrics: FastSlowStoreMetrics, } @@ -383,46 +387,18 @@ impl StoreDriver for FastSlowStore { fn as_any_arc(self: Arc) -> Arc { self } - - fn register_metrics(self: Arc, registry: &mut Registry) { - let fast_store_registry = registry.sub_registry_with_prefix("fast"); - self.fast_store.register_metrics(fast_store_registry); - let slow_store_registry = registry.sub_registry_with_prefix("slow"); - self.slow_store.register_metrics(slow_store_registry); - } } -#[derive(Default)] +#[derive(Default, MetricsComponent)] struct FastSlowStoreMetrics { + #[metric(help = "Hit count for the fast store")] fast_store_hit_count: AtomicU64, + #[metric(help = "Downloaded bytes from the fast store")] fast_store_downloaded_bytes: AtomicU64, + #[metric(help = "Hit count for the slow store")] slow_store_hit_count: AtomicU64, + #[metric(help = "Downloaded bytes from the slow store")] slow_store_downloaded_bytes: AtomicU64, } -impl MetricsComponent for FastSlowStoreMetrics { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish( - "fast_store_hit_count", - &self.fast_store_hit_count, - "Hit count for the fast store", - ); - c.publish( - "fast_store_downloaded_bytes", - &self.fast_store_downloaded_bytes, - "Downloaded bytes from the fast store", - ); - c.publish( - "slow_store_hit_count", - &self.slow_store_hit_count, - "Hit count for the slow store", - ); - c.publish( - "slow_store_downloaded_bytes", - &self.slow_store_downloaded_bytes, - "Downloaded bytes from the slow store", - ); - } -} - default_health_status_indicator!(FastSlowStore); diff --git a/nativelink-store/src/filesystem_store.rs b/nativelink-store/src/filesystem_store.rs index feb2aa27b..2748322cb 100644 --- a/nativelink-store/src/filesystem_store.rs +++ b/nativelink-store/src/filesystem_store.rs @@ -27,13 +27,13 @@ use 
filetime::{set_file_atime, FileTime}; use futures::stream::{StreamExt, TryStreamExt}; use futures::{Future, TryFutureExt}; use nativelink_error::{make_err, make_input_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{ make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf, }; use nativelink_util::common::{fs, DigestInfo}; use nativelink_util::evicting_map::{EvictingMap, LenEntry}; use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator}; -use nativelink_util::metrics_utils::{Collector, CollectorState, MetricsComponent, Registry}; use nativelink_util::store_trait::{StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo}; use nativelink_util::{background_spawn, spawn_blocking}; use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, SeekFrom}; @@ -48,14 +48,17 @@ const DEFAULT_BUFF_SIZE: usize = 32 * 1024; // Default block size of all major filesystems is 4KB const DEFAULT_BLOCK_SIZE: u64 = 4 * 1024; -#[derive(Debug)] +#[derive(Debug, MetricsComponent)] pub struct SharedContext { // Used in testing to know how many active drop() spawns are running. // TODO(allada) It is probably a good idea to use a spin lock during // destruction of the store to ensure that all files are actually // deleted (similar to how it is done in tests). 
+ #[metric(help = "Number of active drop spawns")] pub active_drop_spawns: AtomicU64, + #[metric(help = "Path to the configured temp path")] temp_path: String, + #[metric(help = "Path to the configured content path")] content_path: String, } @@ -514,10 +517,15 @@ async fn prune_temp_path(temp_path: &str) -> Result<(), Error> { Ok(()) } +#[derive(MetricsComponent)] pub struct FilesystemStore { + #[metric] shared_context: Arc, + #[metric(group = "evicting_map")] evicting_map: Arc, SystemTime>>, + #[metric(help = "Block size of the configured filesystem")] block_size: u64, + #[metric(help = "Size of the configured read buffer size")] read_buffer_size: usize, weak_self: Weak, sleep_fn: fn(Duration) -> Sleep, @@ -910,41 +918,11 @@ impl StoreDriver for FilesystemStore { self } - fn register_metrics(self: Arc, registry: &mut Registry) { - registry.register_collector(Box::new(Collector::new(&self))); - } - fn register_health(self: Arc, registry: &mut HealthRegistryBuilder) { registry.register_indicator(self); } } -impl MetricsComponent for FilesystemStore { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish( - "read_buff_size_bytes", - &self.read_buffer_size, - "Size of the configured read buffer size", - ); - c.publish( - "active_drop_spawns_total", - &self.shared_context.active_drop_spawns, - "Number of active drop spawns", - ); - c.publish( - "temp_path", - &self.shared_context.temp_path, - "Path to the configured temp path", - ); - c.publish( - "content_path", - &self.shared_context.content_path, - "Path to the configured content path", - ); - c.publish("evicting_map", self.evicting_map.as_ref(), ""); - } -} - #[async_trait] impl HealthStatusIndicator for FilesystemStore { fn get_name(&self) -> &'static str { diff --git a/nativelink-store/src/grpc_store.rs b/nativelink-store/src/grpc_store.rs index fa7019c42..6f0e38d7a 100644 --- a/nativelink-store/src/grpc_store.rs +++ b/nativelink-store/src/grpc_store.rs @@ -22,6 +22,7 @@ use bytes::BytesMut; use 
futures::stream::{unfold, FuturesUnordered}; use futures::{future, Future, Stream, StreamExt, TryFutureExt, TryStreamExt}; use nativelink_error::{error_if, make_input_err, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_proto::build::bazel::remote::execution::v2::action_cache_client::ActionCacheClient; use nativelink_proto::build::bazel::remote::execution::v2::content_addressable_storage_client::ContentAddressableStorageClient; use nativelink_proto::build::bazel::remote::execution::v2::{ @@ -59,7 +60,9 @@ use uuid::Uuid; // This store is usually a pass-through store, but can also be used as a CAS store. Using it as an // AC store has one major side-effect... The has() function may not give the proper size of the // underlying data. This might cause issues if embedded in certain stores. +#[derive(MetricsComponent)] pub struct GrpcStore { + #[metric(help = "Instance name for the store")] instance_name: String, store_type: nativelink_config::stores::StoreType, retrier: Retrier, diff --git a/nativelink-store/src/memory_store.rs b/nativelink-store/src/memory_store.rs index c194e241b..84a118cc0 100644 --- a/nativelink-store/src/memory_store.rs +++ b/nativelink-store/src/memory_store.rs @@ -22,10 +22,10 @@ use std::time::SystemTime; use async_trait::async_trait; use bytes::{Bytes, BytesMut}; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::evicting_map::{EvictingMap, LenEntry}; use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator}; -use nativelink_util::metrics_utils::{Collector, CollectorState, MetricsComponent, Registry}; use nativelink_util::store_trait::{ StoreDriver, StoreKey, StoreOptimizations, StoreSubscription, StoreSubscriptionItem, UploadSizeInfo, @@ -127,8 +127,10 @@ impl StoreSubscriptionItem for MemoryStoreSubscriptionItem { } type 
SubscriptionSender = watch::Sender, Error>>; +#[derive(MetricsComponent)] pub struct MemoryStore { weak_self: Weak, + #[metric(group = "evicting_map")] evicting_map: EvictingMap, BytesWrapper, SystemTime>, subscriptions: RwLock, SubscriptionSender>>, } @@ -317,16 +319,6 @@ impl StoreDriver for MemoryStore { fn as_any_arc(self: Arc) -> Arc { self } - - fn register_metrics(self: Arc, registry: &mut Registry) { - registry.register_collector(Box::new(Collector::new(&self))); - } -} - -impl MetricsComponent for MemoryStore { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish("evicting_map", &self.evicting_map, ""); - } } default_health_status_indicator!(MemoryStore); diff --git a/nativelink-store/src/noop_store.rs b/nativelink-store/src/noop_store.rs index 151b28675..457e8b199 100644 --- a/nativelink-store/src/noop_store.rs +++ b/nativelink-store/src/noop_store.rs @@ -17,6 +17,9 @@ use std::sync::Arc; use async_trait::async_trait; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_metric::{ + MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, +}; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator}; use nativelink_util::store_trait::{StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo}; @@ -24,6 +27,16 @@ use nativelink_util::store_trait::{StoreDriver, StoreKey, StoreOptimizations, Up #[derive(Default)] pub struct NoopStore; +impl MetricsComponent for NoopStore { + fn publish( + &self, + _kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result { + Ok(MetricPublishKnownKindData::Component) + } +} + impl NoopStore { pub fn new() -> Arc { Arc::new(NoopStore {}) diff --git a/nativelink-store/src/redis_store.rs b/nativelink-store/src/redis_store.rs index 1e10792f7..c72e29d9f 100644 --- a/nativelink-store/src/redis_store.rs +++ b/nativelink-store/src/redis_store.rs @@ -25,10 +25,10 @@ 
use futures::stream::FuturesOrdered; use futures::{Future, TryFutureExt, TryStreamExt}; use nativelink_config::stores::RedisMode; use nativelink_error::{error_if, make_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::background_spawn; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator}; -use nativelink_util::metrics_utils::{Collector, CollectorState, MetricsComponent, Registry}; use nativelink_util::store_trait::{StoreDriver, StoreKey, UploadSizeInfo}; use redis::aio::{ConnectionLike, ConnectionManager}; use redis::cluster_async::ClusterConnection; @@ -239,6 +239,7 @@ unsafe impl Send for BackgroundConnection {} unsafe impl Sync for BackgroundConnection {} /// A [`StoreDriver`] implementation that uses Redis as a backing store. +#[derive(MetricsComponent)] pub struct RedisStore { /// The connection to the underlying Redis instance(s). connection: BackgroundConnection, @@ -250,6 +251,7 @@ pub struct RedisStore { /// A common prefix to append to all keys before they are sent to Redis. /// /// See [`RedisStore::key_prefix`](`nativelink_config::stores::RedisStore::key_prefix`). 
+ #[metric(help = "Prefix to append to all keys before sending to Redis")] key_prefix: String, } @@ -602,22 +604,11 @@ where self } - fn register_metrics(self: Arc, registry: &mut Registry) { - registry.register_collector(Box::new(Collector::new(&self))); - } - fn register_health(self: Arc, registry: &mut HealthRegistryBuilder) { registry.register_indicator(self); } } -impl MetricsComponent for RedisStore -where - C: ConnectionLike + Clone + Send + 'static, -{ - fn gather_metrics(&self, _c: &mut CollectorState) {} -} - #[async_trait] impl HealthStatusIndicator for RedisStore where diff --git a/nativelink-store/src/ref_store.rs b/nativelink-store/src/ref_store.rs index 8e64b185c..cb2ba7cfd 100644 --- a/nativelink-store/src/ref_store.rs +++ b/nativelink-store/src/ref_store.rs @@ -18,6 +18,7 @@ use std::sync::{Arc, Mutex, Weak}; use async_trait::async_trait; use nativelink_error::{make_err, make_input_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator}; use nativelink_util::store_trait::{Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo}; @@ -35,7 +36,9 @@ struct StoreReference { unsafe impl Sync for StoreReference {} +#[derive(MetricsComponent)] pub struct RefStore { + #[metric(help = "The store we are referencing")] ref_store_name: String, store_manager: Weak, ref_store: StoreReference, diff --git a/nativelink-store/src/s3_store.rs b/nativelink-store/src/s3_store.rs index 918f4a976..c5ae44fab 100644 --- a/nativelink-store/src/s3_store.rs +++ b/nativelink-store/src/s3_store.rs @@ -41,6 +41,7 @@ use hyper::service::Service; use hyper::Uri; use hyper_rustls::{HttpsConnector, MaybeHttpsStream}; use nativelink_error::{make_err, make_input_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{ make_buf_channel_pair, 
DropCloserReadHalf, DropCloserWriteHalf, }; @@ -232,12 +233,17 @@ impl http_body::Body for BodyWrapper { } } +#[derive(MetricsComponent)] pub struct S3Store { s3_client: Arc, + #[metric(help = "The bucket name for the S3 store")] bucket: String, + #[metric(help = "The key prefix for the S3 store")] key_prefix: String, retrier: Retrier, + #[metric(help = "The number of bytes to buffer for retrying requests")] max_retry_buffer_per_request: usize, + #[metric(help = "The number of concurrent uploads allowed for multipart uploads")] multipart_max_concurrent_uploads: usize, } diff --git a/nativelink-store/src/shard_store.rs b/nativelink-store/src/shard_store.rs index 270ce9c5d..1dc4e18a4 100644 --- a/nativelink-store/src/shard_store.rs +++ b/nativelink-store/src/shard_store.rs @@ -20,15 +20,28 @@ use std::sync::Arc; use async_trait::async_trait; use futures::stream::{FuturesUnordered, TryStreamExt}; use nativelink_error::{error_if, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator}; -use nativelink_util::metrics_utils::Registry; use nativelink_util::store_trait::{Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo}; +#[derive(MetricsComponent)] +struct StoreAndWeight { + #[metric(help = "The weight of the store")] + weight: u32, + #[metric(help = "The underlying store")] + store: Store, +} + +#[derive(MetricsComponent)] pub struct ShardStore { // The weights will always be in ascending order a specific store is choosen based on the // the hash of the key hash that is nearest-binary searched using the u32 as the index. 
- weights_and_stores: Vec<(u32, Store)>, + #[metric( + group = "stores", + help = "The weights and stores that are used to determine which store to use" + )] + weights_and_stores: Vec, } impl ShardStore { @@ -63,7 +76,11 @@ impl ShardStore { // Our last item should always be the max. *weights.last_mut().unwrap() = u32::MAX; Ok(Arc::new(Self { - weights_and_stores: weights.into_iter().zip(stores).collect(), + weights_and_stores: weights + .into_iter() + .zip(stores) + .map(|(weight, store)| StoreAndWeight { weight, store }) + .collect(), })) } @@ -112,13 +129,13 @@ impl ShardStore { } }; self.weights_and_stores - .binary_search_by_key(&key, |(weight, _)| *weight) + .binary_search_by_key(&key, |item| item.weight) .unwrap_or_else(|index| index) } fn get_store(&self, key: &StoreKey) -> &Store { let index = self.get_store_index(key); - &self.weights_and_stores[index].1 + &self.weights_and_stores[index].store } } @@ -132,7 +149,7 @@ impl StoreDriver for ShardStore { if keys.len() == 1 { // Hot path: It is very common to lookup only one key. 
let store_idx = self.get_store_index(&keys[0]); - let store = &self.weights_and_stores[store_idx].1; + let store = &self.weights_and_stores[store_idx].store; return store .has_with_results(keys, results) .await @@ -159,7 +176,7 @@ impl StoreDriver for ShardStore { .into_iter() .enumerate() .map(|(store_idx, (key_idxs, keys))| async move { - let store = &self.weights_and_stores[store_idx].1; + let store = &self.weights_and_stores[store_idx].store; let mut inner_results = vec![None; keys.len()]; store .has_with_results(&keys, &mut inner_results) @@ -210,7 +227,7 @@ impl StoreDriver for ShardStore { return self; }; let index = self.get_store_index(&key); - self.weights_and_stores[index].1.inner_store(Some(key)) + self.weights_and_stores[index].store.inner_store(Some(key)) } fn as_any<'a>(&'a self) -> &'a (dyn std::any::Any + Sync + Send + 'static) { @@ -220,13 +237,6 @@ impl StoreDriver for ShardStore { fn as_any_arc(self: Arc) -> Arc { self } - - fn register_metrics(self: Arc, registry: &mut Registry) { - for (i, (_, store)) in self.weights_and_stores.iter().enumerate() { - let store_registry = registry.sub_registry_with_prefix(format!("store_{i}")); - store.clone().register_metrics(store_registry); - } - } } default_health_status_indicator!(ShardStore); diff --git a/nativelink-store/src/size_partitioning_store.rs b/nativelink-store/src/size_partitioning_store.rs index 2c54d625d..2ac1cde7c 100644 --- a/nativelink-store/src/size_partitioning_store.rs +++ b/nativelink-store/src/size_partitioning_store.rs @@ -17,15 +17,19 @@ use std::sync::Arc; use async_trait::async_trait; use nativelink_error::{make_input_err, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator}; -use nativelink_util::metrics_utils::{Collector, CollectorState, MetricsComponent, Registry}; use 
nativelink_util::store_trait::{Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo}; use tokio::join; +#[derive(MetricsComponent)] pub struct SizePartitioningStore { + #[metric(help = "Size to partition our data")] partition_size: i64, + #[metric(group = "lower_store")] lower_store: Store, + #[metric(group = "upper_store")] upper_store: Store, } @@ -159,24 +163,6 @@ impl StoreDriver for SizePartitioningStore { fn as_any_arc(self: Arc) -> Arc { self } - - fn register_metrics(self: Arc, registry: &mut Registry) { - let lower_store_registry = registry.sub_registry_with_prefix("lower_store"); - self.lower_store.register_metrics(lower_store_registry); - let upper_store_registry = registry.sub_registry_with_prefix("upper_store"); - self.upper_store.register_metrics(upper_store_registry); - registry.register_collector(Box::new(Collector::new(&self))); - } -} - -impl MetricsComponent for SizePartitioningStore { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish( - "partition_size", - &self.partition_size, - "Size to partition our data", - ); - } } default_health_status_indicator!(SizePartitioningStore); diff --git a/nativelink-store/src/store_manager.rs b/nativelink-store/src/store_manager.rs index 8d4557498..32efda709 100644 --- a/nativelink-store/src/store_manager.rs +++ b/nativelink-store/src/store_manager.rs @@ -13,11 +13,14 @@ // limitations under the License. 
use std::collections::HashMap; -use std::sync::RwLock; +use nativelink_metric::{MetricsComponent, RootMetricsComponent}; use nativelink_util::store_trait::Store; +use parking_lot::RwLock; +#[derive(MetricsComponent)] pub struct StoreManager { + #[metric] stores: RwLock>, } @@ -29,18 +32,12 @@ impl StoreManager { } pub fn add_store(&self, name: &str, store: Store) { - let mut stores = self - .stores - .write() - .expect("Failed to lock mutex in add_store()"); + let mut stores = self.stores.write(); stores.insert(name.to_string(), store); } pub fn get_store(&self, name: &str) -> Option { - let stores = self - .stores - .read() - .expect("Failed to lock read mutex in get_store()"); + let stores = self.stores.read(); if let Some(store) = stores.get(name) { return Some(store.clone()); } @@ -48,6 +45,8 @@ impl StoreManager { } } +impl RootMetricsComponent for StoreManager {} + impl Default for StoreManager { fn default() -> Self { Self::new() diff --git a/nativelink-store/src/verify_store.rs b/nativelink-store/src/verify_store.rs index 100b08d33..449758300 100644 --- a/nativelink-store/src/verify_store.rs +++ b/nativelink-store/src/verify_store.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use async_trait::async_trait; use nativelink_error::{make_input_err, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{ make_buf_channel_pair, DropCloserReadHalf, DropCloserWriteHalf, }; @@ -24,19 +25,23 @@ use nativelink_util::digest_hasher::{ default_digest_hasher_func, DigestHasher, ACTIVE_HASHER_FUNC, }; use nativelink_util::health_utils::{default_health_status_indicator, HealthStatusIndicator}; -use nativelink_util::metrics_utils::{ - Collector, CollectorState, CounterWithTime, MetricsComponent, Registry, -}; +use nativelink_util::metrics_utils::CounterWithTime; use nativelink_util::origin_context::ActiveOriginContext; use nativelink_util::store_trait::{Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo}; +#[derive(MetricsComponent)] 
pub struct VerifyStore { + #[metric(group = "inner_store")] inner_store: Store, + #[metric(help = "If the verification store is verifying the size of the data")] verify_size: bool, + #[metric(help = "If the verification store is verifying the hash of the data")] verify_hash: bool, // Metrics. + #[metric(help = "Number of failures the verification store had due to size mismatches")] size_verification_failures: CounterWithTime, + #[metric(help = "Number of failures the verification store had due to hash mismatches")] hash_verification_failures: CounterWithTime, } @@ -238,37 +243,6 @@ impl StoreDriver for VerifyStore { fn as_any_arc(self: Arc) -> Arc { self } - - fn register_metrics(self: Arc, registry: &mut Registry) { - let backend_store = registry.sub_registry_with_prefix("backend"); - self.inner_store.register_metrics(backend_store); - registry.register_collector(Box::new(Collector::new(&self))); - } -} - -impl MetricsComponent for VerifyStore { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish( - "verify_size_enabled", - &self.verify_size, - "If the verification store is verifying the size of the data", - ); - c.publish( - "verify_hash_enabled", - &self.verify_hash, - "If the verification store is verifying the hash of the data", - ); - c.publish( - "size_verification_failures_total", - &self.size_verification_failures, - "Number of failures the verification store had due to size mismatches", - ); - c.publish( - "hash_verification_failures_total", - &self.hash_verification_failures, - "Number of failures the verification store had due to hash mismatches", - ); - } } default_health_status_indicator!(VerifyStore); diff --git a/nativelink-store/tests/fast_slow_store_test.rs b/nativelink-store/tests/fast_slow_store_test.rs index 8e6829743..47748d0c8 100644 --- a/nativelink-store/tests/fast_slow_store_test.rs +++ b/nativelink-store/tests/fast_slow_store_test.rs @@ -20,6 +20,7 @@ use async_trait::async_trait; use bytes::Bytes; use 
nativelink_error::{make_err, Code, Error, ResultExt}; use nativelink_macro::nativelink_test; +use nativelink_metric::MetricsComponent; use nativelink_store::fast_slow_store::FastSlowStore; use nativelink_store::memory_store::MemoryStore; use nativelink_store::noop_store::NoopStore; @@ -230,6 +231,7 @@ fn calculate_range_test() { #[nativelink_test] async fn drop_on_eof_completes_store_futures() -> Result<(), Error> { + #[derive(MetricsComponent)] struct DropCheckStore { drop_flag: Arc, read_rx: Mutex>>, @@ -301,12 +303,6 @@ async fn drop_on_eof_completes_store_futures() -> Result<(), Error> { fn as_any_arc(self: Arc) -> Arc { self } - - fn register_metrics( - self: Arc, - _registry: &mut nativelink_util::metrics_utils::Registry, - ) { - } } impl Drop for DropCheckStore { diff --git a/nativelink-util/BUILD.bazel b/nativelink-util/BUILD.bazel index 8cb9fe668..8d894a501 100644 --- a/nativelink-util/BUILD.bazel +++ b/nativelink-util/BUILD.bazel @@ -41,6 +41,7 @@ rust_library( deps = [ "//nativelink-config", "//nativelink-error", + "//nativelink-metric", "//nativelink-proto", "@crates//:async-lock", "@crates//:bitflags", @@ -56,7 +57,6 @@ rust_library( "@crates//:parking_lot", "@crates//:pin-project", "@crates//:pin-project-lite", - "@crates//:prometheus-client", "@crates//:prost", "@crates//:prost-types", "@crates//:rand", diff --git a/nativelink-util/Cargo.toml b/nativelink-util/Cargo.toml index ab0cbb83c..13b0b0517 100644 --- a/nativelink-util/Cargo.toml +++ b/nativelink-util/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" nativelink-config = { path = "../nativelink-config" } nativelink-error = { path = "../nativelink-error" } nativelink-proto = { path = "../nativelink-proto" } +nativelink-metric = { path = "../nativelink-metric" } async-lock = "3.3.0" async-trait = "0.1.80" @@ -22,7 +23,6 @@ hyper-util = { version = "0.1.6", features = ["tokio"] } lru = "0.12.3" parking_lot = "0.12.2" pin-project-lite = "0.2.14" -prometheus-client = "0.21.2" prost = "0.12.4" prost-types 
= "0.12.4" rand = "0.8.5" diff --git a/nativelink-util/src/action_messages.rs b/nativelink-util/src/action_messages.rs index 4d839ef03..b74f142c1 100644 --- a/nativelink-util/src/action_messages.rs +++ b/nativelink-util/src/action_messages.rs @@ -18,6 +18,9 @@ use std::hash::{Hash, Hasher}; use std::time::{Duration, SystemTime}; use nativelink_error::{error_if, make_input_err, Error, ResultExt}; +use nativelink_metric::{ + publish, MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, +}; use nativelink_proto::build::bazel::remote::execution::v2::{ execution_stage, Action, ActionResult as ProtoActionResult, ExecuteOperationMetadata, ExecuteRequest, ExecuteResponse, ExecutedActionMetadata, FileNode, LogFile, OutputDirectory, @@ -35,7 +38,6 @@ use uuid::Uuid; use crate::common::{DigestInfo, HashMapExt, VecExt}; use crate::digest_hasher::DigestHasherFunc; -use crate::metrics_utils::{CollectorState, MetricsComponent}; use crate::platform_properties::PlatformProperties; /// Default priority remote execution jobs will get when not provided. 
@@ -70,9 +72,15 @@ impl std::fmt::Display for ClientOperationId { } } -#[derive(Clone, Serialize, Deserialize)] +fn uuid_to_string(uuid: &Uuid) -> String { + uuid.hyphenated().to_string() +} + +#[derive(Clone, Serialize, Deserialize, MetricsComponent)] pub struct OperationId { + #[metric(help = "The unique qualifier of the operation")] pub unique_qualifier: ActionUniqueQualifier, + #[metric(help = "The id of the operation", handler = uuid_to_string)] pub id: Uuid, } @@ -212,6 +220,16 @@ impl std::fmt::Debug for OperationId { #[derive(Default, Eq, PartialEq, Hash, Copy, Clone, Serialize, Deserialize)] pub struct WorkerId(pub Uuid); +impl MetricsComponent for WorkerId { + fn publish( + &self, + _kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result { + Ok(MetricPublishKnownKindData::String(uuid_to_string(&self.0))) + } +} + impl std::fmt::Display for WorkerId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let mut buf = Uuid::encode_buffer(); @@ -248,6 +266,28 @@ pub enum ActionUniqueQualifier { Uncachable(ActionUniqueKey), } +impl MetricsComponent for ActionUniqueQualifier { + fn publish( + &self, + _kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let (cachable, action) = match self { + Self::Cachable(action) => (true, action), + Self::Uncachable(action) => (false, action), + }; + publish!( + cachable, + &cachable, + MetricKind::Default, + "If the action is cachable.", + "" + ); + action.publish(MetricKind::Component, field_metadata)?; + Ok(MetricPublishKnownKindData::Component) + } +} + impl ActionUniqueQualifier { /// Get the instance_name of the action. pub const fn instance_name(&self) -> &String { @@ -293,13 +333,16 @@ impl std::fmt::Display for ActionUniqueQualifier { /// This is a utility struct used to make it easier to match `ActionInfos` in a /// `HashMap` without needing to construct an entire `ActionInfo`. 
-#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize, MetricsComponent)] pub struct ActionUniqueKey { /// Name of instance group this action belongs to. + #[metric(help = "Name of instance group this action belongs to.")] pub instance_name: String, /// The digest function this action expects. + #[metric(help = "The digest function this action expects.")] pub digest_function: DigestHasherFunc, /// Digest of the underlying `Action`. + #[metric(help = "Digest of the underlying Action.")] pub digest: DigestInfo, } @@ -308,24 +351,32 @@ pub struct ActionUniqueKey { /// to ensure we never match against another `ActionInfo` (when a task should never be cached). /// This struct must be 100% compatible with `ExecuteRequest` struct in `remote_execution.proto` /// except for the salt field. -#[derive(Clone, Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize, MetricsComponent)] pub struct ActionInfo { /// Digest of the underlying `Command`. + #[metric(help = "Digest of the underlying Command.")] pub command_digest: DigestInfo, /// Digest of the underlying `Directory`. + #[metric(help = "Digest of the underlying Directory.")] pub input_root_digest: DigestInfo, /// Timeout of the action. + #[metric(help = "Timeout of the action.")] pub timeout: Duration, /// The properties rules that must be applied when finding a worker that can run this action. + #[metric(group = "platform_properties")] pub platform_properties: PlatformProperties, /// The priority of the action. Higher value means it should execute faster. + #[metric(help = "The priority of the action. Higher value means it should execute faster.")] pub priority: i32, /// When this action started to be loaded from the CAS. + #[metric(help = "When this action started to be loaded from the CAS.")] pub load_timestamp: SystemTime, /// When this action was created. 
+ #[metric(help = "When this action was created.")] pub insert_timestamp: SystemTime, /// Info used to uniquely identify this ActionInfo and if it is cachable. /// This is primarily used to join actions/operations together using this key. + #[metric(help = "Info used to uniquely identify this ActionInfo and if it is cachable.")] pub unique_qualifier: ActionUniqueQualifier, } @@ -810,21 +861,20 @@ impl ActionStage { } impl MetricsComponent for ActionStage { - fn gather_metrics(&self, c: &mut CollectorState) { - let (stage, maybe_exit_code) = match self { - ActionStage::Unknown => ("Unknown", None), - ActionStage::CacheCheck => ("CacheCheck", None), - ActionStage::Queued => ("Queued", None), - ActionStage::Executing => ("Executing", None), - ActionStage::Completed(action_result) => ("Completed", Some(action_result.exit_code)), - ActionStage::CompletedFromCache(proto_action_result) => { - ("CompletedFromCache", Some(proto_action_result.exit_code)) - } + fn publish( + &self, + _kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result { + let value = match self { + ActionStage::Unknown => "Unknown".to_string(), + ActionStage::CacheCheck => "CacheCheck".to_string(), + ActionStage::Queued => "Queued".to_string(), + ActionStage::Executing => "Executing".to_string(), + ActionStage::Completed(_) => "Completed".to_string(), + ActionStage::CompletedFromCache(_) => "CompletedFromCache".to_string(), }; - c.publish("stage", &stage.to_string(), "The state of the action."); - if let Some(exit_code) = maybe_exit_code { - c.publish("exit_code", &exit_code, "The exit code of the action."); - } + Ok(MetricPublishKnownKindData::String(value)) } } @@ -1089,9 +1139,11 @@ where /// Current state of the action. /// This must be 100% compatible with `Operation` in `google/longrunning/operations.proto`. 
-#[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] +#[derive(PartialEq, Debug, Clone, Serialize, Deserialize, MetricsComponent)] pub struct ActionState { + #[metric(help = "The current stage of the action.")] pub stage: ActionStage, + #[metric(help = "The unique identifier of the action.")] pub id: OperationId, } @@ -1173,9 +1225,3 @@ impl ActionState { } } } - -impl MetricsComponent for ActionState { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish("stage", &self.stage, ""); - } -} diff --git a/nativelink-util/src/common.rs b/nativelink-util/src/common.rs index 5318719c9..19b3673c9 100644 --- a/nativelink-util/src/common.rs +++ b/nativelink-util/src/common.rs @@ -20,6 +20,9 @@ use std::hash::Hash; use bytes::{BufMut, Bytes, BytesMut}; use hex::FromHex; use nativelink_error::{make_input_err, Error, ResultExt}; +use nativelink_metric::{ + MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, +}; use nativelink_proto::build::bazel::remote::execution::v2::Digest; use prost::Message; use serde::{Deserialize, Serialize}; @@ -36,6 +39,17 @@ pub struct DigestInfo { pub size_bytes: i64, } +impl MetricsComponent for DigestInfo { + fn publish( + &self, + _kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + format!("{}-{}", self.hash_str(), self.size_bytes) + .publish(MetricKind::String, field_metadata) + } +} + impl DigestInfo { pub const fn new(packed_hash: [u8; 32], size_bytes: i64) -> Self { DigestInfo { diff --git a/nativelink-util/src/digest_hasher.rs b/nativelink-util/src/digest_hasher.rs index a02f20fd9..72721bffe 100644 --- a/nativelink-util/src/digest_hasher.rs +++ b/nativelink-util/src/digest_hasher.rs @@ -19,6 +19,9 @@ use bytes::BytesMut; use futures::Future; use nativelink_config::stores::ConfigDigestHashFunction; use nativelink_error::{make_err, make_input_err, Code, Error, ResultExt}; +use nativelink_metric::{ + MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, +}; use 
nativelink_proto::build::bazel::remote::execution::v2::digest_function::Value as ProtoDigestFunction; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; @@ -68,6 +71,16 @@ pub enum DigestHasherFunc { Blake3, } +impl MetricsComponent for DigestHasherFunc { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + format!("{self:?}").publish(kind, field_metadata) + } +} + impl DigestHasherFunc { pub fn hasher(&self) -> DigestHasherImpl { self.into() diff --git a/nativelink-util/src/evicting_map.rs b/nativelink-util/src/evicting_map.rs index a67f2fb60..382313851 100644 --- a/nativelink-util/src/evicting_map.rs +++ b/nativelink-util/src/evicting_map.rs @@ -24,11 +24,12 @@ use std::sync::Arc; use async_lock::Mutex; use lru::LruCache; use nativelink_config::stores::EvictionPolicy; +use nativelink_metric::MetricsComponent; use serde::{Deserialize, Serialize}; use tracing::{event, Level}; use crate::instant_wrapper::InstantWrapper; -use crate::metrics_utils::{CollectorState, Counter, CounterWithTime, MetricsComponent}; +use crate::metrics_utils::{Counter, CounterWithTime}; #[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] pub struct SerializedLRU { @@ -96,18 +97,22 @@ impl LenEntry for Arc { } } +#[derive(MetricsComponent)] struct State { lru: LruCache>, btree: Option>, + #[metric(help = "Total size of all items in the store")] sum_store_size: u64, - // Metrics. 
+ #[metric(help = "Number of bytes evicted from the store")] evicted_bytes: Counter, + #[metric(help = "Number of items evicted from the store")] evicted_items: CounterWithTime, + #[metric(help = "Number of bytes replaced in the store")] replaced_bytes: Counter, + #[metric(help = "Number of items replaced in the store")] replaced_items: CounterWithTime, - removed_bytes: Counter, - removed_items: CounterWithTime, + #[metric(help = "Number of bytes inserted into the store since it was created")] lifetime_inserted_bytes: Counter, } @@ -147,12 +152,18 @@ impl State } } +#[derive(MetricsComponent)] pub struct EvictingMap { + #[metric] state: Mutex>, anchor_time: I, + #[metric(help = "Maximum size of the store in bytes")] max_bytes: u64, + #[metric(help = "Number of bytes to evict when the store is full")] evict_bytes: u64, + #[metric(help = "Maximum number of seconds to keep an item in the store")] max_seconds: i32, + #[metric(help = "Maximum number of items to keep in the store")] max_count: u64, } @@ -174,8 +185,6 @@ where evicted_items: CounterWithTime::default(), replaced_bytes: Counter::default(), replaced_items: CounterWithTime::default(), - removed_bytes: Counter::default(), - removed_items: CounterWithTime::default(), lifetime_inserted_bytes: Counter::default(), }), anchor_time, @@ -461,111 +470,3 @@ where false } } - -impl MetricsComponent - for EvictingMap -{ - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish( - "max_bytes", - &self.max_bytes, - "Maximum size of the store in bytes", - ); - c.publish( - "evict_bytes", - &self.evict_bytes, - "Number of bytes to evict when the store is full", - ); - c.publish( - "anchor_time_timestamp", - &self.anchor_time.unix_timestamp(), - "Anchor time for the store", - ); - c.publish( - "max_seconds", - &self.max_seconds, - "Maximum number of seconds to keep an item in the store", - ); - c.publish( - "max_count", - &self.max_count, - "Maximum number of items to keep in the store", - ); - 
futures::executor::block_on(async move { - let state = self.state.lock().await; - c.publish( - "sum_store_size_bytes", - &state.sum_store_size, - "Total size of all items in the store", - ); - c.publish( - "items_in_store_total", - &state.lru.len(), - "Number of items in the store", - ); - c.publish( - "oldest_item_timestamp", - &state - .lru - .peek_lru() - .map(|(_, v)| { - self.anchor_time.unix_timestamp() as i64 - v.seconds_since_anchor as i64 - }) - .unwrap_or(-1), - "Timestamp of the oldest item in the store", - ); - c.publish( - "newest_item_timestamp", - &state - .lru - .iter() - .next() - .map(|(_, v)| { - self.anchor_time.unix_timestamp() as i64 - v.seconds_since_anchor as i64 - }) - .unwrap_or(-1), - "Timestamp of the newest item in the store", - ); - c.publish( - "evicted_items_total", - &state.evicted_items, - "Number of items evicted from the store", - ); - c.publish( - "evicted_bytes", - &state.evicted_bytes, - "Number of bytes evicted from the store", - ); - c.publish( - "lifetime_inserted_bytes", - &state.lifetime_inserted_bytes, - "Number of bytes inserted into the store since it was created", - ); - c.publish( - "replaced_bytes", - &state.replaced_bytes, - "Number of bytes replaced in the store", - ); - c.publish( - "replaced_items_total", - &state.replaced_items, - "Number of items replaced in the store", - ); - c.publish( - "removed_bytes", - &state.removed_bytes, - "Number of bytes explicitly removed from the store", - ); - c.publish( - "removed_items_total", - &state.removed_items, - "Number of items explicitly removed from the store", - ); - c.publish_stats( - "item_size_bytes", - state.lru.iter().take(1_000_000).map(|(_, v)| v.data.len()), - "Stats about the first 1_000_000 items in the store (these are newest items in the store)", - ); - }); - } -} diff --git a/nativelink-util/src/metrics_utils.rs b/nativelink-util/src/metrics_utils.rs index 346656b04..3ba95a564 100644 --- a/nativelink-util/src/metrics_utils.rs +++ 
b/nativelink-util/src/metrics_utils.rs @@ -12,36 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::borrow::Cow; -use std::fmt::Debug; -use std::marker::PhantomData; use std::mem::forget; -use std::sync::atomic::{ - AtomicBool, AtomicI16, AtomicI32, AtomicI64, AtomicI8, AtomicIsize, AtomicU16, AtomicU32, - AtomicU64, AtomicU8, AtomicUsize, Ordering, -}; -use std::sync::{Arc, Weak}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::thread_local; -use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; +use std::time::{Instant, SystemTime, UNIX_EPOCH}; use futures::Future; -use prometheus_client::collector::Collector as PrometheusCollector; -use prometheus_client::encoding::{EncodeMetric, MetricEncoder}; -use prometheus_client::metrics::info::Info; -use prometheus_client::metrics::MetricType; -pub use prometheus_client::registry::Registry; -use prometheus_client::registry::{Descriptor, LocalMetric, Prefix}; -use prometheus_client::MaybeOwned; - -/// A component that can be registered with the metrics collector. -pub trait MetricsComponent { - /// This method will magically be called by the metrics collector to gather - /// all the metrics from this component if it has been registered. - /// This function should be extremely fast. - /// - /// It is safe to block in this function. - fn gather_metrics(&self, collector: &mut CollectorState); -} +use nativelink_metric::{ + group, publish, MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, +}; thread_local! { /// This is a thread local variable that will enable or disable metrics for @@ -60,7 +39,7 @@ thread_local! 
{ pub fn metrics_enabled() -> bool { METRICS_ENABLED.with( #[inline] - |v| v.load(Ordering::Relaxed), + |v| v.load(Ordering::Acquire), ) } @@ -69,185 +48,7 @@ pub fn metrics_enabled() -> bool { /// so you'd need to run this function on every thread in the thread pool in /// order to enable it everywhere. pub fn set_metrics_enabled_for_this_thread(enabled: bool) { - METRICS_ENABLED.with(|v| v.store(enabled, Ordering::Relaxed)); -} - -type NameString = String; -type HelpString = String; -type Labels = Vec<(Cow<'static, str>, Cow<'static, str>)>; -type Metric = ( - NameString, - HelpString, - MaybeOwned<'static, Box>, - Labels, -); - -type TextMetric = (NameString, HelpString, String, Labels); - -#[derive(Default)] -pub struct CollectorState { - module_name: Option, - metrics: Vec, - text: Vec, - children: Vec<(CollectorState, Labels)>, -} - -impl CollectorState { - /// Publishes a value. This should be the primary way a metric is published. - /// Any special types that want metrics published should implement `MetricPublisher` - /// for that type. - #[inline] - pub fn publish( - &mut self, - name: impl Into, - value: impl MetricPublisher, - help: impl Into, - ) { - value.publish(self, name.into(), help.into(), vec![]); - } - - /// Same as publish() but with labels. - #[inline] - pub fn publish_with_labels( - &mut self, - name: impl Into, - value: impl MetricPublisher, - help: impl Into, - labels: Labels, - ) { - value.publish(self, name.into(), help.into(), labels); - } - - /// Publish a numerical metric. Usually used by `MetricPublisher` to publish metrics. - #[inline] - pub fn publish_number( - &mut self, - name: impl Into, - value: T, - help: impl Into, - labels: impl Into, - ) where - N: Debug + 'static, - T: Into>, - NumericalMetric: EncodeMetric, - { - let gague: Box = Box::new(value.into()); - self.metrics.push(( - name.into(), - help.into(), - MaybeOwned::Owned(gague), - labels.into(), - )); - } - - /// Publish a static text metric. 
Generally these are used for labels and don't - /// change during runtime. Usually used by `MetricPublisher` to publish metrics. - #[inline] - pub fn publish_text( - &mut self, - name: impl Into, - value: impl Into, - help: impl Into, - labels: impl Into, - ) { - self.text - .push((name.into(), help.into(), value.into(), labels.into())); - } - - /// Publish a histogram metric. Be careful not to have the iterator take too - /// much data or this will consume a lot of memory because we need to collect - /// all the data and sort them to calculate the percentiles. - #[inline] - pub fn publish_stats( - &mut self, - name: impl Into + Clone, - data: impl Iterator, - help: impl Into + Clone, - ) where - N: Debug + 'static, - T: Into> + Ord + Copy + std::fmt::Display, - NumericalMetric: EncodeMetric, - { - let mut data = data.collect::>(); - if data.is_empty() { - return; - } - data.sort_unstable(); - let data_len = data.len() as f64; - for i in &[ - 0.00, 0.01, 0.03, 0.05, 0.10, 0.30, 0.50, 0.70, 0.90, 0.95, 0.97, 0.99, 1.00, - ] { - let index = (i * data_len) as usize; - let value = data - .get(if index < data.len() { index } else { index - 1 }) - .unwrap(); - let labels = vec![("quantile".into(), format!("{i:.2}").into())]; - self.publish_number(name.clone(), *value, help.clone(), labels); - } - } - - fn into_metrics<'a>(self, parent_labels: Labels) -> CollectorResult<'a> { - let module_name1 = self.module_name.clone(); - let module_name2 = self.module_name.clone(); - let parent_labels1 = parent_labels.clone(); - let parent_labels2 = parent_labels.clone(); - let parent_labels3 = parent_labels; - Box::new( - self.metrics - .into_iter() - .map(move |(name, help, metric, labels)| { - let mut combined_labels = parent_labels1.clone(); - combined_labels.extend_from_slice(&labels); - let mut prefix: Option = None; - if let Some(parent_prefix) = &module_name1 { - prefix = Some(Prefix::from(parent_prefix.clone())); - } - ( - Cow::Owned(Descriptor::new( - name, - help, - None, - 
prefix.as_ref(), - combined_labels, - )), - metric, - ) - }) - .chain( - self.text - .into_iter() - .map(move |(name, help, value, labels)| { - let mut combined_labels = parent_labels2.clone(); - combined_labels.extend_from_slice(&labels); - let info: Box = - Box::new(Info::new(vec![(name, value)])); - let mut prefix: Option = None; - if let Some(parent_prefix) = &module_name2 { - prefix = Some(Prefix::from(parent_prefix.clone())); - } - ( - Cow::Owned(Descriptor::new( - "labels", - help, - None, - prefix.as_ref(), - combined_labels, - )), - MaybeOwned::Owned(info), - ) - }), - ) - .chain( - self.children - .into_iter() - .flat_map(move |(child_state, labels)| { - let mut combined_labels = parent_labels3.clone(); - combined_labels.extend_from_slice(&labels); - child_state.into_metrics(combined_labels) - }), - ), - ) - } + METRICS_ENABLED.with(|v| v.store(enabled, Ordering::Release)); } #[derive(Default)] @@ -261,35 +62,43 @@ impl FuncCounterWrapper { pub fn wrap(&self, func: impl FnOnce() -> Result) -> Result { let result = (func)(); if result.is_ok() { - self.successes.fetch_add(1, Ordering::Relaxed); + self.successes.fetch_add(1, Ordering::Acquire); } else { - self.failures.fetch_add(1, Ordering::Relaxed); + self.failures.fetch_add(1, Ordering::Acquire); } result } } -impl MetricPublisher for &FuncCounterWrapper { - #[inline] - fn publish(&self, state: &mut CollectorState, name: String, help: String, labels: Labels) { - let successes = self.successes.load(Ordering::Relaxed); - let failures = self.failures.load(Ordering::Relaxed); - let mut success_labels = labels.clone(); - success_labels.extend_from_slice(&[("type".into(), "success".into())]); - state.publish_number( - name.clone(), - successes, - format!("{help} The number of successes."), - success_labels, +// Derive-macros have no way to tell the collector that the parent +// is now a group with the name of the group as the field so we +// can attach multiple values on the same group, so we need to +// 
manually implement the `MetricsComponent` trait to do so. +impl MetricsComponent for FuncCounterWrapper { + fn publish( + &self, + _kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let _enter = group!(field_metadata.name).entered(); + + publish!( + "successes", + &self.successes, + MetricKind::Counter, + format!( + "The number of times {} was successful.", + field_metadata.name + ) ); - let mut failure_labels = labels; - failure_labels.extend_from_slice(&[("type".into(), "failure".into())]); - state.publish_number( - name, - failures, - format!("{help} The number of failures."), - failure_labels, + publish!( + "failures", + &self.failures, + MetricKind::Counter, + format!("The number of times {} failed.", field_metadata.name) ); + + Ok(MetricPublishKnownKindData::Component) } } @@ -313,7 +122,7 @@ impl<'a> Drop for DropCounter<'a> { if !metrics_enabled() { return; } - self.counter.fetch_add(1, Ordering::Relaxed); + self.counter.fetch_add(1, Ordering::Acquire); } } @@ -331,9 +140,9 @@ impl<'a> AsyncTimer<'a> { } self.counter .sum_func_duration_ns - .fetch_add(self.start.elapsed().as_nanos() as u64, Ordering::Relaxed); - self.counter.calls.fetch_add(1, Ordering::Relaxed); - self.counter.successes.fetch_add(1, Ordering::Relaxed); + .fetch_add(self.start.elapsed().as_nanos() as u64, Ordering::Acquire); + self.counter.calls.fetch_add(1, Ordering::Acquire); + self.counter.successes.fetch_add(1, Ordering::Acquire); // This causes DropCounter's drop to never be called. forget(self.drop_counter); } @@ -353,18 +162,71 @@ pub struct AsyncCounterWrapper { pub sum_func_duration_ns: AtomicU64, } +// Derive-macros have no way to tell the collector that the parent +// is now a group with the name of the group as the field so we +// can attach multiple values on the same group, so we need to +// manually implement the `MetricsComponent` trait to do so. 
+impl MetricsComponent for AsyncCounterWrapper { + fn publish( + &self, + _kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let _enter = group!(field_metadata.name).entered(); + + publish!( + "calls", + &self.calls, + MetricKind::Counter, + format!("The number of times {} was called.", field_metadata.name) + ); + publish!( + "successes", + &self.successes, + MetricKind::Counter, + format!( + "The number of times {} was successful.", + field_metadata.name + ) + ); + publish!( + "failures", + &self.failures, + MetricKind::Counter, + format!("The number of times {} failed.", field_metadata.name) + ); + publish!( + "drops", + &self.drops, + MetricKind::Counter, + format!("The number of times {} was dropped.", field_metadata.name) + ); + publish!( + "sum_func_duration_ns", + &self.sum_func_duration_ns, + MetricKind::Counter, + format!( + "The sum of the time spent in nanoseconds in {}.", + field_metadata.name + ) + ); + + Ok(MetricPublishKnownKindData::Component) + } +} + impl AsyncCounterWrapper { #[inline] pub fn wrap_fn<'a, T: 'a, E>( &'a self, func: impl FnOnce() -> Result + 'a, ) -> Result { - self.calls.fetch_add(1, Ordering::Relaxed); + self.calls.fetch_add(1, Ordering::Acquire); let result = (func)(); if result.is_ok() { - self.successes.fetch_add(1, Ordering::Relaxed); + self.successes.fetch_add(1, Ordering::Acquire); } else { - self.failures.fetch_add(1, Ordering::Relaxed); + self.failures.fetch_add(1, Ordering::Acquire); } result } @@ -379,9 +241,9 @@ impl AsyncCounterWrapper { } let result = self.wrap_no_capture_result(future).await; if result.is_ok() { - self.successes.fetch_add(1, Ordering::Relaxed); + self.successes.fetch_add(1, Ordering::Acquire); } else { - self.failures.fetch_add(1, Ordering::Relaxed); + self.failures.fetch_add(1, Ordering::Acquire); } result } @@ -394,7 +256,7 @@ impl AsyncCounterWrapper { if !metrics_enabled() { return future.await; } - self.calls.fetch_add(1, Ordering::Relaxed); + self.calls.fetch_add(1, 
Ordering::Acquire); let drop_counter = DropCounter::new(&self.drops); let instant = Instant::now(); let result = future.await; @@ -402,7 +264,7 @@ impl AsyncCounterWrapper { // This will ensure we don't increment the counter if we make it here with a zero cost. forget(drop_counter); self.sum_func_duration_ns - .fetch_add(instant.elapsed().as_nanos() as u64, Ordering::Relaxed); + .fetch_add(instant.elapsed().as_nanos() as u64, Ordering::Acquire); result } @@ -416,65 +278,6 @@ impl AsyncCounterWrapper { } } -impl MetricPublisher for &AsyncCounterWrapper { - #[inline] - fn publish(&self, state: &mut CollectorState, name: String, help: String, labels: Labels) { - let calls = self.calls.load(Ordering::Relaxed); - let successes = self.successes.load(Ordering::Relaxed); - let failures = self.failures.load(Ordering::Relaxed); - let drops = self.drops.load(Ordering::Relaxed); - let active = calls - successes - failures - drops; - let non_zero_calls = if calls == 0 { 1 } else { calls }; - let avg_duration_ns = self.sum_func_duration_ns.load(Ordering::Relaxed) / non_zero_calls; - { - let mut labels = labels.clone(); - labels.extend_from_slice(&[("type".into(), "drop".into())]); - state.publish_number( - name.clone(), - drops, - format!("{help} The number of dropped futures."), - labels, - ); - } - { - let mut labels = labels.clone(); - labels.extend_from_slice(&[("type".into(), "success".into())]); - state.publish_number( - name.clone(), - successes, - format!("{help} The number of successes."), - labels, - ); - } - { - let mut labels = labels.clone(); - labels.extend_from_slice(&[("type".into(), "failure".into())]); - state.publish_number( - name.clone(), - failures, - format!("{help} The number of failures."), - labels, - ); - } - { - let mut labels = labels.clone(); - labels.extend_from_slice(&[("type".into(), "active".into())]); - state.publish_number( - name.clone(), - active, - format!("{help} The number of active futures."), - labels, - ); - } - 
state.publish_with_labels( - format!("{name}_avg_duration_ns"), - &avg_duration_ns, - format!("{help} The average number of nanos spent in future."), - labels, - ); - } -} - /// Tracks an number. #[derive(Default)] pub struct Counter(AtomicU64); @@ -490,7 +293,7 @@ impl Counter { if !metrics_enabled() { return; } - self.0.fetch_add(value, Ordering::Relaxed); + self.0.fetch_add(value, Ordering::Acquire); } #[inline] @@ -498,14 +301,17 @@ impl Counter { if !metrics_enabled() { return; } - self.0.fetch_sub(value, Ordering::Relaxed); + self.0.fetch_sub(value, Ordering::Acquire); } } -impl MetricPublisher for &Counter { - #[inline] - fn publish(&self, state: &mut CollectorState, name: String, help: String, labels: Labels) { - state.publish_with_labels(name, &self.0, help, labels); +impl MetricsComponent for Counter { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + self.0.publish(kind, field_metadata) } } @@ -513,7 +319,7 @@ impl MetricPublisher for &Counter { #[derive(Default)] pub struct CounterWithTime { pub counter: AtomicU64, - pub last_time: AtomicI64, + pub last_time: AtomicU64, } impl CounterWithTime { @@ -522,232 +328,42 @@ impl CounterWithTime { if !metrics_enabled() { return; } - self.counter.fetch_add(1, Ordering::Relaxed); + self.counter.fetch_add(1, Ordering::Acquire); self.last_time.store( SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap() - .as_secs() as i64, - Ordering::Relaxed, - ); - } -} - -impl MetricPublisher for &CounterWithTime { - #[inline] - fn publish(&self, state: &mut CollectorState, name: String, help: String, labels: Labels) { - state.publish_with_labels( - format!("{name}_last_ts"), - &self.last_time, - format!("The timestamp of when {name} was last published"), - labels.clone(), + .as_secs(), + Ordering::Release, ); - state.publish_with_labels(name, &self.counter, help, labels); - } -} - -pub struct Collector -where - T: Sync + Send + 'static, -{ - handle: Weak, - _marker: PhantomData, 
-} - -impl Collector -where - T: Sync + Send + 'static, -{ - pub fn new(handle: &Arc) -> Self { - Self { - handle: Arc::downgrade(handle), - _marker: PhantomData, - } - } -} - -impl Debug for Collector { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Collector").finish() - } -} - -type CollectorResult<'a> = - Box, MaybeOwned<'a, Box>)> + 'a>; - -impl PrometheusCollector for Collector { - fn collect(&self) -> CollectorResult { - let Some(handle) = self.handle.upgrade() else { - // Don't report any metrics if the component is no longer alive. - return Box::new(std::iter::empty()); - }; - - let mut state = CollectorState::default(); - handle.gather_metrics(&mut state); - state.into_metrics(vec![]) } } -pub trait MetricPublisher { - /// Publish a gague metric. - fn publish(&self, state: &mut CollectorState, name: String, help: String, labels: Labels); -} - -/// Implements MetricPublisher for string types. -impl MetricPublisher for &String { - #[inline] - fn publish(&self, state: &mut CollectorState, name: String, help: String, labels: Labels) { - state.publish_text(name, *self, help, labels); - } -} - -/// Implements MetricPublisher for MetricsComponent. -impl MetricPublisher for &T -where - T: MetricsComponent, -{ - #[inline] +// Derive-macros have no way to tell the collector that the parent +// is now a group with the name of the group as the field so we +// can attach multiple values on the same group, so we need to +// manually implement the `MetricsComponent` trait to do so. 
+impl MetricsComponent for CounterWithTime { fn publish( &self, - parent_state: &mut CollectorState, - module_name: String, - _help: String, - labels: Labels, - ) { - let module_name = if module_name.is_empty() { - None - } else { - Some(module_name) - }; - let mut state = CollectorState { - module_name: match (&parent_state.module_name, module_name) { - (Some(parent), None) => Some(parent.clone()), - (Some(parent), Some(child)) => Some(format!("{parent}_{child}")), - (None, child) => child, - }, - metrics: Vec::default(), - text: Vec::default(), - children: Vec::default(), - }; - self.gather_metrics(&mut state); - parent_state.children.push((state, labels)); - } -} - -impl MetricPublisher for &SystemTime { - #[inline] - fn publish(&self, state: &mut CollectorState, name: String, help: String, labels: Labels) { - state.publish_number( - name, - NumericalMetric(self.duration_since(UNIX_EPOCH).unwrap().as_secs_f64()), - help, - labels, + _kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let _enter = group!(field_metadata.name).entered(); + + publish!( + "counter", + &self.counter, + MetricKind::Counter, + format!("Current count of {}.", field_metadata.name) + ); + publish!( + "last_time", + &self.last_time, + MetricKind::Counter, + format!("Last timestamp {} was published.", field_metadata.name) ); - } -} - -impl EncodeMetric for NumericalMetric { - fn encode(&self, mut encoder: MetricEncoder) -> Result<(), std::fmt::Error> { - encoder.encode_gauge(&self.0.duration_since(UNIX_EPOCH).unwrap().as_secs_f64()) - } - - fn metric_type(&self) -> MetricType { - MetricType::Gauge - } -} - -impl MetricPublisher for &Duration { - #[inline] - fn publish(&self, state: &mut CollectorState, name: String, help: String, labels: Labels) { - state.publish_number(name, NumericalMetric(self.as_secs_f64()), help, labels); - } -} - -impl EncodeMetric for NumericalMetric { - fn encode(&self, mut encoder: MetricEncoder) -> Result<(), std::fmt::Error> { - 
encoder.encode_gauge(&self.0.as_secs_f64()) - } - fn metric_type(&self) -> MetricType { - MetricType::Gauge + Ok(MetricPublishKnownKindData::Component) } } - -macro_rules! impl_publish_atomic { - ($($t:ty),*) => { - $( - impl MetricPublisher for &$t { - #[inline] - fn publish(&self, state: &mut CollectorState, name: String, help: String, labels: Labels) { - state.publish_number(name, &self.load(Ordering::Relaxed), help, labels); - } - } - )* - }; -} - -impl_publish_atomic!( - AtomicU8, - AtomicU16, - AtomicU32, - AtomicU64, - AtomicUsize, - AtomicI8, - AtomicI16, - AtomicI32, - AtomicI64, - AtomicIsize -); - -#[derive(Debug)] -pub struct NumericalMetric(T); - -macro_rules! impl_numerical { - ($($t:ty),*) => { - $( - impl From<$t> for NumericalMetric<$t> { - #[inline] - fn from(t: $t) -> Self { - NumericalMetric(t) - } - } - impl From<&$t> for NumericalMetric<$t> { - #[inline] - fn from(t: &$t) -> Self { - NumericalMetric(*t) - } - } - )* - }; -} - -// Regsiter all the numerical types to be converted into Numerical. -impl_numerical!(u8, bool, u16, u32, u64, usize, i8, i16, i32, i64, isize, f32, f64); - -macro_rules! impl_numerical_metric { - ($u:ty,$($t:ty),*) => { - $( - impl MetricPublisher for &$t { - #[inline] - fn publish(&self, state: &mut CollectorState, name: String, help: String, labels: Labels) { - state.publish_number(name, *self, help, labels); - } - } - - impl EncodeMetric for NumericalMetric<$t> { - fn encode(&self, mut encoder: MetricEncoder) -> Result<(), std::fmt::Error> { - encoder.encode_gauge(&TryInto::<$u>::try_into(self.0).map_err(|_| std::fmt::Error::default())?) - } - - fn metric_type(&self) -> MetricType { - MetricType::Gauge - } - } - )* - }; -} -// Implement metrics for all the numerical integer types by trying to cast it to i64. -impl_numerical_metric!(i64, bool, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize); - -// Implement metrics for all float types by trying to cast it to f64. 
-impl_numerical_metric!(f64, f64, f32); diff --git a/nativelink-util/src/operation_state_manager.rs b/nativelink-util/src/operation_state_manager.rs index cb1b331e3..066a80e38 100644 --- a/nativelink-util/src/operation_state_manager.rs +++ b/nativelink-util/src/operation_state_manager.rs @@ -20,7 +20,7 @@ use async_trait::async_trait; use bitflags::bitflags; use futures::Stream; use nativelink_error::Error; -use prometheus_client::registry::Registry; +use nativelink_metric::MetricsComponent; use crate::action_messages::{ ActionInfo, ActionStage, ActionState, ActionUniqueKey, ClientOperationId, OperationId, WorkerId, @@ -97,7 +97,7 @@ pub type ActionStateResultStream<'a> = Pin> + Send + 'a>>; #[async_trait] -pub trait ClientStateManager: Sync + Send { +pub trait ClientStateManager: Sync + Send + MetricsComponent { /// Add a new action to the queue or joins an existing action. async fn add_action( &self, @@ -110,13 +110,10 @@ pub trait ClientStateManager: Sync + Send { &'a self, filter: OperationFilter, ) -> Result, Error>; - - /// Register metrics with the registry. - fn register_metrics(self: Arc, _registry: &mut Registry) {} } #[async_trait] -pub trait WorkerStateManager: Sync + Send { +pub trait WorkerStateManager: Sync + Send + MetricsComponent { /// Update that state of an operation. /// The worker must also send periodic updates even if the state /// did not change with a modified timestamp in order to prevent @@ -127,13 +124,10 @@ pub trait WorkerStateManager: Sync + Send { worker_id: &WorkerId, action_stage: Result, ) -> Result<(), Error>; - - /// Register metrics with the registry. - fn register_metrics(self: Arc, _registry: &mut Registry) {} } #[async_trait] -pub trait MatchingEngineStateManager: Sync + Send { +pub trait MatchingEngineStateManager: Sync + Send + MetricsComponent { /// Returns a stream of operations that match the filter. 
async fn filter_operations<'a>( &'a self, @@ -146,7 +140,4 @@ pub trait MatchingEngineStateManager: Sync + Send { operation_id: &OperationId, worker_id_or_reason_for_unsassign: Result<&WorkerId, Error>, ) -> Result<(), Error>; - - /// Register metrics with the registry. - fn register_metrics(self: Arc, _registry: &mut Registry) {} } diff --git a/nativelink-util/src/platform_properties.rs b/nativelink-util/src/platform_properties.rs index 7a90e189f..3bcf998bf 100644 --- a/nativelink-util/src/platform_properties.rs +++ b/nativelink-util/src/platform_properties.rs @@ -15,6 +15,9 @@ use std::borrow::Cow; use std::collections::HashMap; +use nativelink_metric::{ + publish, MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, +}; use nativelink_proto::build::bazel::remote::execution::v2::Platform as ProtoPlatform; use serde::{Deserialize, Serialize}; @@ -25,8 +28,9 @@ use serde::{Deserialize, Serialize}; /// all the platform property keys configured on the worker. /// /// Additional rules may be applied based on `PlatfromPropertyValue`. 
-#[derive(Eq, PartialEq, Clone, Debug, Default, Serialize, Deserialize)] +#[derive(Eq, PartialEq, Clone, Debug, Default, Serialize, Deserialize, MetricsComponent)] pub struct PlatformProperties { + #[metric] pub properties: HashMap, } @@ -117,3 +121,22 @@ impl PlatformPropertyValue { } } } + +impl MetricsComponent for PlatformPropertyValue { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let name = field_metadata.name.into_owned(); + let help = field_metadata.help.as_ref(); + match self { + Self::Exact(v) => publish!(name, v, kind, help, "exact"), + Self::Minimum(v) => publish!(name, v, kind, help, "minimum"), + Self::Priority(v) => publish!(name, v, kind, help, "priority"), + Self::Unknown(v) => publish!(name, v, kind, help, "unknown"), + } + + Ok(MetricPublishKnownKindData::Component) + } +} diff --git a/nativelink-util/src/store_trait.rs b/nativelink-util/src/store_trait.rs index 218d5d92e..0f081f7ff 100644 --- a/nativelink-util/src/store_trait.rs +++ b/nativelink-util/src/store_trait.rs @@ -25,6 +25,7 @@ use bytes::{Bytes, BytesMut}; use futures::future::{select, Either}; use futures::{join, try_join, Future, FutureExt}; use nativelink_error::{error_if, make_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use rand::rngs::StdRng; use rand::{RngCore, SeedableRng}; use serde::{Deserialize, Serialize}; @@ -37,7 +38,6 @@ use crate::default_store_key_subscribe::default_store_key_subscribe; use crate::digest_hasher::{default_digest_hasher_func, DigestHasher, DigestHasherFunc}; use crate::fs::{self, idle_file_descriptor_timeout}; use crate::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator}; -use crate::metrics_utils::Registry; static DEFAULT_DIGEST_SIZE_HEALTH_CHECK: OnceLock = OnceLock::new(); /// Default digest size for health check data. 
Any change in this value @@ -332,9 +332,10 @@ impl<'a> From<&DigestInfo> for StoreKey<'a> { } } -#[derive(Clone)] +#[derive(Clone, MetricsComponent)] #[repr(transparent)] pub struct Store { + #[metric] inner: Arc, } @@ -383,12 +384,6 @@ impl Store { self.inner.clone().subscribe(key.into()) } - /// Register any metrics that this store wants to expose to the Prometheus. - #[inline] - pub fn register_metrics(&self, registry: &mut Registry) { - self.inner.clone().register_metrics(registry) - } - /// Register health checks used to monitor the store. #[inline] pub fn register_health(&self, registry: &mut HealthRegistryBuilder) { @@ -598,7 +593,9 @@ pub trait StoreLike: Send + Sync + Sized + Unpin + 'static { } #[async_trait] -pub trait StoreDriver: Sync + Send + Unpin + HealthStatusIndicator + 'static { +pub trait StoreDriver: + Sync + Send + Unpin + MetricsComponent + HealthStatusIndicator + 'static +{ /// See: [`StoreLike::has`] for details. #[inline] async fn has(self: Pin<&Self>, key: StoreKey<'_>) -> Result, Error> { @@ -834,9 +831,6 @@ pub trait StoreDriver: Sync + Send + Unpin + HealthStatusIndicator + 'static { fn as_any(&self) -> &(dyn std::any::Any + Sync + Send + 'static); fn as_any_arc(self: Arc) -> Arc; - /// Register any metrics that this store wants to expose to the Prometheus. - fn register_metrics(self: Arc, _registry: &mut Registry) {} - // Register health checks used to monitor the store. 
fn register_health(self: Arc, _registry: &mut HealthRegistryBuilder) {} } diff --git a/nativelink-worker/BUILD.bazel b/nativelink-worker/BUILD.bazel index e55bf1a3e..d0c78662c 100644 --- a/nativelink-worker/BUILD.bazel +++ b/nativelink-worker/BUILD.bazel @@ -22,6 +22,7 @@ rust_library( deps = [ "//nativelink-config", "//nativelink-error", + "//nativelink-metric", "//nativelink-proto", "//nativelink-scheduler", "//nativelink-store", diff --git a/nativelink-worker/Cargo.toml b/nativelink-worker/Cargo.toml index 241f951ae..a846e4502 100644 --- a/nativelink-worker/Cargo.toml +++ b/nativelink-worker/Cargo.toml @@ -9,6 +9,7 @@ nativelink-proto = { path = "../nativelink-proto" } nativelink-config = { path = "../nativelink-config" } nativelink-util = { path = "../nativelink-util" } nativelink-store = { path = "../nativelink-store" } +nativelink-metric = { path = "../nativelink-metric" } # TODO(aaronmondal): This should not be a dependency. Move corresponding # functionality out of the schedulers. 
diff --git a/nativelink-worker/src/local_worker.rs b/nativelink-worker/src/local_worker.rs index f9316bfcf..a99e80720 100644 --- a/nativelink-worker/src/local_worker.rs +++ b/nativelink-worker/src/local_worker.rs @@ -24,6 +24,7 @@ use futures::stream::FuturesUnordered; use futures::{select, Future, FutureExt, StreamExt, TryFutureExt}; use nativelink_config::cas_server::LocalWorkerConfig; use nativelink_error::{make_err, make_input_err, Code, Error, ResultExt}; +use nativelink_metric::{MetricsComponent, RootMetricsComponent}; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::update_for_worker::Update; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::worker_api_client::WorkerApiClient; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ @@ -33,9 +34,7 @@ use nativelink_store::fast_slow_store::FastSlowStore; use nativelink_util::action_messages::{ActionResult, ActionStage}; use nativelink_util::common::fs; use nativelink_util::digest_hasher::{DigestHasherFunc, ACTIVE_HASHER_FUNC}; -use nativelink_util::metrics_utils::{ - AsyncCounterWrapper, Collector, CollectorState, CounterWithTime, MetricsComponent, Registry, -}; +use nativelink_util::metrics_utils::{AsyncCounterWrapper, CounterWithTime}; use nativelink_util::origin_context::ActiveOriginContext; use nativelink_util::store_trait::Store; use nativelink_util::{spawn, tls_utils}; @@ -388,7 +387,13 @@ pub async fn new_local_worker( cas_store: Store, ac_store: Option, historical_store: Store, -) -> Result, Error> { +) -> Result< + ( + LocalWorker, + Arc, + ), + Error, +> { let fast_slow_store = cas_store .downcast_ref::(None) .err_tip(|| "Expected store for LocalWorker's store to be a FastSlowStore")? 
@@ -428,42 +433,40 @@ pub async fn new_local_worker( max_action_timeout, timeout_handled_externally: config.timeout_handled_externally, })?); - Ok( - LocalWorker::new_with_connection_factory_and_actions_manager( - config.clone(), - running_actions_manager, - Box::new(move || { - let config = config.clone(); - Box::pin(async move { - let timeout = config - .worker_api_endpoint - .timeout - .unwrap_or(DEFAULT_ENDPOINT_TIMEOUT_S); - let timeout_duration = Duration::from_secs_f32(timeout); - let tls_config = - tls_utils::load_client_config(&config.worker_api_endpoint.tls_config) - .err_tip(|| "Parsing local worker TLS configuration")?; - let endpoint = - tls_utils::endpoint_from(&config.worker_api_endpoint.uri, tls_config) - .map_err(|e| { - make_input_err!("Invalid URI for worker endpoint : {e:?}") - })? - .connect_timeout(timeout_duration) - .timeout(timeout_duration); - - let transport = endpoint.connect().await.map_err(|e| { - make_err!( - Code::Internal, - "Could not connect to endpoint {}: {e:?}", - config.worker_api_endpoint.uri - ) - })?; - Ok(WorkerApiClient::new(transport).into()) - }) - }), - Box::new(move |d| Box::pin(sleep(d))), - ), - ) + let local_worker = LocalWorker::new_with_connection_factory_and_actions_manager( + config.clone(), + running_actions_manager, + Box::new(move || { + let config = config.clone(); + Box::pin(async move { + let timeout = config + .worker_api_endpoint + .timeout + .unwrap_or(DEFAULT_ENDPOINT_TIMEOUT_S); + let timeout_duration = Duration::from_secs_f32(timeout); + let tls_config = + tls_utils::load_client_config(&config.worker_api_endpoint.tls_config) + .err_tip(|| "Parsing local worker TLS configuration")?; + let endpoint = + tls_utils::endpoint_from(&config.worker_api_endpoint.uri, tls_config) + .map_err(|e| make_input_err!("Invalid URI for worker endpoint : {e:?}"))? 
+ .connect_timeout(timeout_duration) + .timeout(timeout_duration); + + let transport = endpoint.connect().await.map_err(|e| { + make_err!( + Code::Internal, + "Could not connect to endpoint {}: {e:?}", + config.worker_api_endpoint.uri + ) + })?; + Ok(WorkerApiClient::new(transport).into()) + }) + }), + Box::new(move |d| Box::pin(sleep(d))), + ); + let metrics = local_worker.metrics.clone(); + Ok((local_worker, metrics)) } impl LocalWorker { @@ -594,20 +597,28 @@ impl LocalWorker { } // Unreachable. } - - pub fn register_metrics(&self, registry: &mut Registry) { - registry.register_collector(Box::new(Collector::new(&self.metrics))); - } } -struct Metrics { +#[derive(MetricsComponent)] +pub struct Metrics { + #[metric( + help = "Total number of actions sent to this worker to process. This does not mean it started them, it just means it received a request to execute it." + )] start_actions_received: CounterWithTime, + #[metric(help = "Total number of disconnects received from the scheduler.")] disconnects_received: CounterWithTime, + #[metric(help = "Total number of keep-alives received from the scheduler.")] keep_alives_received: CounterWithTime, + #[metric( + help = "Stats about the calls to check if an action satisfies the config supplied script." + )] preconditions: AsyncCounterWrapper, + #[metric] running_actions_manager_metrics: Weak, } +impl RootMetricsComponent for Metrics {} + impl Metrics { fn new(running_actions_manager_metrics: Weak) -> Self { Self { @@ -628,37 +639,3 @@ impl Metrics { fut(self).await } } - -impl MetricsComponent for Metrics { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish( - "start_actions_received", - &self.start_actions_received, - concat!( - "Total number of actions sent to this worker to process. This ", - "does not mean it started them, it just means it received a request ", - "to execute it." 
- ), - ); - c.publish( - "disconnects_received", - &self.disconnects_received, - "Total number of disconnects received from the scheduler.", - ); - c.publish( - "keep_alives_received", - &self.keep_alives_received, - "Total number of keep-alives received from the scheduler.", - ); - c.publish( - "preconditions", - &self.preconditions, - "Stats about the calls to check if an action satisfies the config supplied script.", // Data is appended to this. - ); - if let Some(running_actions_manager_metrics) = - self.running_actions_manager_metrics.upgrade() - { - c.publish("", running_actions_manager_metrics.as_ref(), ""); - } - } -} diff --git a/nativelink-worker/src/running_actions_manager.rs b/nativelink-worker/src/running_actions_manager.rs index ef8aeb458..8f6914edf 100644 --- a/nativelink-worker/src/running_actions_manager.rs +++ b/nativelink-worker/src/running_actions_manager.rs @@ -40,6 +40,7 @@ use nativelink_config::cas_server::{ EnvironmentSource, UploadActionResultConfig, UploadCacheResultsStrategy, }; use nativelink_error::{make_err, make_input_err, Code, Error, ResultExt}; +use nativelink_metric::MetricsComponent; use nativelink_proto::build::bazel::remote::execution::v2::{ Action, ActionResult as ProtoActionResult, Command as ProtoCommand, Directory as ProtoDirectory, Directory, DirectoryNode, ExecuteResponse, FileNode, SymlinkNode, @@ -60,9 +61,7 @@ use nativelink_util::action_messages::{ }; use nativelink_util::common::{fs, DigestInfo}; use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc}; -use nativelink_util::metrics_utils::{ - AsyncCounterWrapper, CollectorState, CounterWithTime, MetricsComponent, -}; +use nativelink_util::metrics_utils::{AsyncCounterWrapper, CounterWithTime}; use nativelink_util::store_trait::{Store, StoreLike, UploadSizeInfo}; use nativelink_util::{background_spawn, spawn, spawn_blocking}; use parking_lot::Mutex; @@ -1908,123 +1907,46 @@ impl RunningActionsManager for RunningActionsManagerImpl { } } -#[derive(Default)] 
+#[derive(Default, MetricsComponent)] pub struct Metrics { + #[metric(help = "Stats about the create_and_add_action command.")] create_and_add_action: AsyncCounterWrapper, + #[metric(help = "Stats about the cache_action_result command.")] cache_action_result: AsyncCounterWrapper, + #[metric(help = "Stats about the kill_all command.")] kill_all: AsyncCounterWrapper, + #[metric(help = "Stats about the create_action_info command.")] create_action_info: AsyncCounterWrapper, + #[metric(help = "Stats about the make_work_directory command.")] make_action_directory: AsyncCounterWrapper, + #[metric(help = "Stats about the prepare_action command.")] prepare_action: AsyncCounterWrapper, + #[metric(help = "Stats about the execute command.")] execute: AsyncCounterWrapper, + #[metric(help = "Stats about the upload_results command.")] upload_results: AsyncCounterWrapper, + #[metric(help = "Stats about the cleanup command.")] cleanup: AsyncCounterWrapper, + #[metric(help = "Stats about the get_finished_result command.")] get_finished_result: AsyncCounterWrapper, + #[metric(help = "Stats about the get_proto_command_from_store command.")] get_proto_command_from_store: AsyncCounterWrapper, + #[metric(help = "Stats about the download_to_directory command.")] download_to_directory: AsyncCounterWrapper, + #[metric(help = "Stats about the prepare_output_files command.")] prepare_output_files: AsyncCounterWrapper, + #[metric(help = "Stats about the prepare_output_paths command.")] prepare_output_paths: AsyncCounterWrapper, + #[metric(help = "Stats about the child_process command.")] child_process: AsyncCounterWrapper, + #[metric(help = "Stats about the child_process_success_error_code command.")] child_process_success_error_code: CounterWithTime, + #[metric(help = "Stats about the child_process_failure_error_code command.")] child_process_failure_error_code: CounterWithTime, + #[metric(help = "Total time spent uploading stdout.")] upload_stdout: AsyncCounterWrapper, + #[metric(help = 
"Total time spent uploading stderr.")] upload_stderr: AsyncCounterWrapper, + #[metric(help = "Total number of task timeouts.")] task_timeouts: CounterWithTime, } - -impl MetricsComponent for Metrics { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish( - "create_and_add_action", - &self.create_and_add_action, - "Stats about the create_and_add_action command.", - ); - c.publish( - "cache_action_result", - &self.cache_action_result, - "Stats about the cache_action_result command.", - ); - c.publish( - "kill_all", - &self.kill_all, - "Stats about the kill_all command.", - ); - c.publish( - "create_action_info", - &self.create_action_info, - "Stats about the create_action_info command.", - ); - c.publish( - "make_work_directory", - &self.make_action_directory, - "Stats about the make_work_directory command.", - ); - c.publish( - "prepare_action", - &self.prepare_action, - "Stats about the prepare_action command.", - ); - c.publish("execute", &self.execute, "Stats about the execute command."); - c.publish( - "upload_results", - &self.upload_results, - "Stats about the upload_results command.", - ); - c.publish("cleanup", &self.cleanup, "Stats about the cleanup command."); - c.publish( - "get_finished_result", - &self.get_finished_result, - "Stats about the get_finished_result command.", - ); - c.publish( - "get_proto_command_from_store", - &self.get_proto_command_from_store, - "Stats about the get_proto_command_from_store command.", - ); - c.publish( - "download_to_directory", - &self.download_to_directory, - "Stats about the download_to_directory command.", - ); - c.publish( - "prepare_output_files", - &self.prepare_output_files, - "Stats about the prepare_output_files command.", - ); - c.publish( - "prepare_output_paths", - &self.prepare_output_paths, - "Stats about the prepare_output_paths command.", - ); - c.publish( - "child_process", - &self.child_process, - "Stats about the child_process command.", - ); - c.publish( - 
"child_process_success_error_code", - &self.child_process_success_error_code, - "Stats about the child_process_success_error_code command.", - ); - c.publish( - "child_process_failure_error_code", - &self.child_process_failure_error_code, - "Stats about the child_process_failure_error_code command.", - ); - c.publish( - "upload_stdout", - &self.upload_stdout, - "Total time spent uploading stdout.", - ); - c.publish( - "upload_stderr", - &self.upload_stderr, - "Total time spent uploading stderr.", - ); - c.publish( - "task_timeouts_count", - &self.task_timeouts, - "Total number of task timeouts.", - ) - } -} diff --git a/src/bin/nativelink.rs b/src/bin/nativelink.rs index c05c62b16..fdc07f1ef 100644 --- a/src/bin/nativelink.rs +++ b/src/bin/nativelink.rs @@ -22,13 +22,18 @@ use axum::Router; use clap::Parser; use futures::future::{select_all, BoxFuture, Either, OptionFuture, TryFutureExt}; use hyper::server::conn::Http; -use hyper::{Response, StatusCode}; +use hyper::{Body, Response, StatusCode}; use mimalloc::MiMalloc; use nativelink_config::cas_server::{ CasConfig, GlobalConfig, HttpCompressionAlgorithm, ListenerConfig, ServerConfig, WorkerConfig, }; use nativelink_config::stores::ConfigDigestHashFunction; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_metric::{ + MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, RootMetricsComponent, +}; +use nativelink_metric_collector::{otel_export, MetricsCollectorLayer}; +use nativelink_scheduler::action_scheduler::ActionScheduler; use nativelink_scheduler::default_scheduler_factory::scheduler_factory; use nativelink_service::ac_server::AcServer; use nativelink_service::bep_server::BepServer; @@ -44,10 +49,7 @@ use nativelink_util::action_messages::WorkerId; use nativelink_util::common::fs::{set_idle_file_descriptor_timeout, set_open_file_limit}; use nativelink_util::digest_hasher::{set_default_digest_hasher_func, DigestHasherFunc}; use 
nativelink_util::health_utils::HealthRegistryBuilder; -use nativelink_util::metrics_utils::{ - set_metrics_enabled_for_this_thread, Collector, CollectorState, Counter, MetricsComponent, - Registry, -}; +use nativelink_util::metrics_utils::{set_metrics_enabled_for_this_thread, Counter}; use nativelink_util::origin_context::OriginContext; use nativelink_util::store_trait::{ set_default_digest_size_health_check, DEFAULT_DIGEST_SIZE_HEALTH_CHECK_CFG, @@ -55,7 +57,10 @@ use nativelink_util::store_trait::{ use nativelink_util::task::TaskExecutor; use nativelink_util::{background_spawn, init_tracing, spawn, spawn_blocking}; use nativelink_worker::local_worker::new_local_worker; -use parking_lot::Mutex; +use opentelemetry::metrics::MeterProvider; +use opentelemetry_sdk::metrics::SdkMeterProvider; +use parking_lot::{Mutex, RwLock}; +use prometheus::{Encoder, TextEncoder}; use rustls_pemfile::{certs as extract_certs, crls as extract_crls}; use scopeguard::guard; use tokio::net::TcpListener; @@ -69,6 +74,7 @@ use tonic::codec::CompressionEncoding; use tonic::transport::Server as TonicServer; use tower::util::ServiceExt; use tracing::{error_span, event, trace_span, Level}; +use tracing_subscriber::layer::SubscriberExt; #[global_allocator] static GLOBAL: MiMalloc = MiMalloc; @@ -99,11 +105,61 @@ struct Args { config_file: String, } +/// The root metrics collector struct. All metrics will be +/// collected from this struct traversing down each child +/// component. +#[derive(MetricsComponent)] +struct RootMetrics { + #[metric(group = "stores")] + stores: Arc, + #[metric(group = "servers")] + servers: HashMap>, + #[metric(group = "workers")] + workers: HashMap>, + // TODO(allada) We cannot upcast these to RootMetricsComponent because + // of https://github.com/rust-lang/rust/issues/65991. + // TODO(allada) To prevent output from being too verbose we only + // print the action_schedulers. 
+ #[metric(group = "action_schedulers")] + schedulers: HashMap>, +} + +impl RootMetricsComponent for RootMetrics {} + +/// Wrapper to allow us to hash `SocketAddr` for metrics. +#[derive(Hash, PartialEq, Eq)] +struct SocketAddrWrapper(SocketAddr); + +impl MetricsComponent for SocketAddrWrapper { + fn publish( + &self, + _kind: MetricKind, + _field_metadata: MetricFieldData, + ) -> Result { + Ok(MetricPublishKnownKindData::String(self.0.to_string())) + } +} + +impl RootMetricsComponent for SocketAddrWrapper {} + +/// Simple wrapper to enable us to register the Hashmap so it can +/// report metrics about what clients are connected. +#[derive(MetricsComponent)] +struct ConnectedClientsMetrics { + #[metric(group = "currently_connected_clients")] + inner: Mutex>, + #[metric(help = "Total client connections since server started")] + counter: Counter, + #[metric(help = "Timestamp when the server started")] + server_start_ts: u64, +} + +impl RootMetricsComponent for ConnectedClientsMetrics {} + async fn inner_main( cfg: CasConfig, server_start_timestamp: u64, ) -> Result<(), Box> { - let mut root_metrics_registry = ::with_prefix("nativelink"); let health_registry_builder = Arc::new(AsyncMutex::new(HealthRegistryBuilder::new( "nativelink".into(), ))); @@ -111,41 +167,30 @@ async fn inner_main( let store_manager = Arc::new(StoreManager::new()); { let mut health_registry_lock = health_registry_builder.lock().await; - let root_store_metrics = root_metrics_registry.sub_registry_with_prefix("stores"); for (name, store_cfg) in cfg.stores { let health_component_name = format!("stores/{name}"); let mut health_register_store = health_registry_lock.sub_builder(health_component_name.into()); - let store_metrics = root_store_metrics.sub_registry_with_prefix(&name); - store_manager.add_store( - &name, - store_factory( - &store_cfg, - &store_manager, - Some(store_metrics), - Some(&mut health_register_store), - ) + let store = store_factory(&store_cfg, &store_manager, Some(&mut 
health_register_store)) .await - .err_tip(|| format!("Failed to create store '{name}'"))?, - ); + .err_tip(|| format!("Failed to create store '{name}'"))?; + store_manager.add_store(&name, store); } } let mut action_schedulers = HashMap::new(); let mut worker_schedulers = HashMap::new(); if let Some(schedulers_cfg) = cfg.schedulers { - let root_scheduler_metrics = root_metrics_registry.sub_registry_with_prefix("schedulers"); for (name, scheduler_cfg) in schedulers_cfg { - let scheduler_metrics = root_scheduler_metrics.sub_registry_with_prefix(&name); let (maybe_action_scheduler, maybe_worker_scheduler) = - scheduler_factory(&scheduler_cfg, &store_manager, scheduler_metrics) + scheduler_factory(&scheduler_cfg, &store_manager) .err_tip(|| format!("Failed to create scheduler '{name}'"))?; if let Some(action_scheduler) = maybe_action_scheduler { - action_schedulers.insert(name.clone(), action_scheduler); + action_schedulers.insert(name.clone(), action_scheduler.clone()); } if let Some(worker_scheduler) = maybe_worker_scheduler { - worker_schedulers.insert(name.clone(), worker_scheduler); + worker_schedulers.insert(name.clone(), worker_scheduler.clone()); } } } @@ -157,39 +202,7 @@ async fn inner_main( } } - /// Simple wrapper to enable us to register the Hashmap so it can - /// report metrics about what clients are connected. 
- struct ConnectedClientsMetrics { - inner: Mutex>, - counter: Counter, - server_start_ts: u64, - } - impl MetricsComponent for ConnectedClientsMetrics { - fn gather_metrics(&self, c: &mut CollectorState) { - c.publish( - "server_start_time", - &self.server_start_ts, - "Timestamp when the server started", - ); - - let connected_clients = self.inner.lock(); - for client in connected_clients.iter() { - c.publish_with_labels( - "connected_clients", - &1, - "The endpoint of the connected clients", - vec![("endpoint".into(), format!("{client}").into())], - ); - } - - c.publish( - "total_client_connections", - &self.counter, - "Total client connections since server started", - ); - } - } - + let mut server_metrics: HashMap> = HashMap::new(); // Registers all the ConnectedClientsMetrics to the registries // and zips them in. It is done this way to get around the need // for `root_metrics_registry` to become immutable in the loop. @@ -208,9 +221,7 @@ async fn inner_main( counter: Counter::default(), server_start_ts: server_start_timestamp, }); - let server_metrics = - root_metrics_registry.sub_registry_with_prefix(format!("server_{name}")); - server_metrics.register_collector(Box::new(Collector::new(&connected_clients_mux))); + server_metrics.insert(name.clone(), connected_clients_mux.clone()); (server_cfg, connected_clients_mux) }) @@ -218,8 +229,13 @@ async fn inner_main( let mut root_futures: Vec>> = Vec::new(); - // Lock our registry as immutable and clonable. - let root_metrics_registry = Arc::new(AsyncMutex::new(root_metrics_registry)); + let root_metrics = Arc::new(RwLock::new(RootMetrics { + stores: store_manager.clone(), + servers: server_metrics, + workers: HashMap::new(), // Will be filled in later. 
+ schedulers: action_schedulers.clone(), + })); + for (server_cfg, connected_clients_mux) in servers_and_clients { let services = server_cfg.services.ok_or("'services' must be configured")?; @@ -422,7 +438,6 @@ async fn inner_main( .err_tip(|| "Could not create WorkerApi service")?, ); - let root_metrics_registry = root_metrics_registry.clone(); let health_registry = health_registry_builder.lock().await.build(); let mut svc = Router::new() @@ -439,8 +454,8 @@ async fn inner_main( } if let Some(prometheus_cfg) = services.experimental_prometheus { - fn error_to_response(e: E) -> Response { - let mut response = Response::new(format!("Error: {e:?}")); + fn error_to_response(e: E) -> Response { + let mut response = Response::new(format!("Error: {e:?}").into()); *response.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; response } @@ -449,9 +464,12 @@ async fn inner_main( } else { &prometheus_cfg.path }; + + let root_metrics_clone = root_metrics.clone(); + svc = svc.route_service( path, - axum::routing::get(move |_request: hyper::Request| { + axum::routing::get(move |request: hyper::Request| { Arc::new(OriginContext::new()).wrap_async( trace_span!("prometheus_ctx"), async move { @@ -459,19 +477,91 @@ async fn inner_main( // collection. This allows it to call functions like `tokio::block_in_place` // if it needs to wait on a future. spawn_blocking!("prometheus_metrics", move || { - let mut buf = String::new(); - let root_metrics_registry_guard = - futures::executor::block_on(root_metrics_registry.lock()); - prometheus_client::encoding::text::encode( - &mut buf, - &root_metrics_registry_guard, + let (layer, output_metrics) = MetricsCollectorLayer::new(); + + // Traverse all the MetricsComponent's. The `MetricsCollectorLayer` will + // collect all the metrics and store them in `output_metrics`. 
+ tracing::subscriber::with_default( + tracing_subscriber::registry().with(layer), + || { + let metrics_component = root_metrics_clone.read(); + MetricsComponent::publish( + &*metrics_component, + MetricKind::Component, + MetricFieldData::default(), + ) + }, ) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e)) - .map(|_| { - // This is a hack to get around this bug: https://github.com/prometheus/client_rust/issues/155 - buf = buf.replace("nativelink_nativelink_stores_", ""); - buf = buf.replace("nativelink_nativelink_workers_", ""); - let mut response = Response::new(buf); + .map_err(|e| make_err!(Code::Internal, "{e}")) + .err_tip(|| "While processing prometheus metrics")?; + + // Convert the collected metrics into OpenTelemetry metrics then + // encode them into Prometheus format and populate them into a + // hyper::Response. + let response = { + let registry = prometheus::Registry::new(); + let exporter = opentelemetry_prometheus::exporter() + .with_registry(registry.clone()) + .without_counter_suffixes() + .without_scope_info() + .build() + .map_err(|e| make_err!(Code::Internal, "{e}")) + .err_tip(|| { + "While creating OpenTelemetry Prometheus exporter" + })?; + + // Prepare our OpenTelemetry collector/exporter. + let provider = + SdkMeterProvider::builder().with_reader(exporter).build(); + let meter = provider.meter("nativelink"); + + // TODO(allada) We should put this as part of the config instead of a magic + // request header. + if let Some(json_type) = + request.headers().get("x-nativelink-json") + { + let json_data = if json_type == "pretty" { + serde_json::to_string_pretty(&*output_metrics.lock()) + .map_err(|e| { + make_err!( + Code::Internal, + "Could not convert to json {e:?}" + ) + })? + } else { + serde_json::to_string(&*output_metrics.lock()).map_err( + |e| { + make_err!( + Code::Internal, + "Could not convert to json {e:?}" + ) + }, + )? 
+ }; + let mut response = Response::new(Body::from(json_data)); + response.headers_mut().insert( + hyper::header::CONTENT_TYPE, + hyper::header::HeaderValue::from_static( + "application/json", + ), + ); + return Ok(response); + } + + // Export the metrics to OpenTelemetry. + otel_export( + "nativelink".to_string(), + &meter, + &output_metrics.lock(), + ); + + // Translate the OpenTelemetry metrics to Prometheus format and encode + // them into a hyper::Response. + let mut result = vec![]; + TextEncoder::new() + .encode(&registry.gather(), &mut result) + .unwrap(); + let mut response = Response::new(Body::from(result)); // Per spec we should probably use `application/openmetrics-text; version=1.0.0; charset=utf-8` // https://github.com/OpenObservability/OpenMetrics/blob/1386544931307dff279688f332890c31b6c5de36/specification/OpenMetrics.md#overall-structure // However, this makes debugging more difficult, so we use the old text/plain instead. @@ -481,11 +571,12 @@ async fn inner_main( "text/plain; version=0.0.4; charset=utf-8", ), ); - response - }) - .unwrap_or_else(error_to_response) + Result::<_, Error>::Ok(response) + }; + response }) .await + .unwrap_or_else(|e| Ok(error_to_response(e))) .unwrap_or_else(error_to_response) }, ) @@ -691,7 +782,10 @@ async fn inner_main( ?socket_addr, "Client connected" ); - connected_clients_mux.inner.lock().insert(remote_addr); + connected_clients_mux + .inner + .lock() + .insert(SocketAddrWrapper(remote_addr)); connected_clients_mux.counter.inc(); // This is the safest way to guarantee that if our future @@ -707,7 +801,10 @@ async fn inner_main( "Client disconnected" ); if let Some(connected_clients_mux) = weak_connected_clients_mux.upgrade() { - connected_clients_mux.inner.lock().remove(&remote_addr); + connected_clients_mux + .inner + .lock() + .remove(&SocketAddrWrapper(remote_addr)); } }, ); @@ -767,9 +864,8 @@ async fn inner_main( // We start workers after our TcpListener is setup so if our worker connects to one // of these
services it will be able to connect. let worker_cfgs = cfg.workers.unwrap_or_default(); - let mut root_metrics_registry_guard = root_metrics_registry.lock().await; - let root_worker_metrics = root_metrics_registry_guard.sub_registry_with_prefix("workers"); let mut worker_names = HashSet::with_capacity(worker_cfgs.len()); + let mut worker_metrics: HashMap> = HashMap::new(); for (i, worker_cfg) in worker_cfgs.into_iter().enumerate() { let spawn_fut = match worker_cfg { WorkerConfig::local(local_worker_cfg) => { @@ -805,7 +901,7 @@ async fn inner_main( } else { fast_slow_store.clone() }; - let local_worker = new_local_worker( + let (local_worker, metrics) = new_local_worker( Arc::new(local_worker_cfg), fast_slow_store, maybe_ac_store, @@ -825,9 +921,8 @@ async fn inner_main( name )))?; } - let worker_metrics = root_worker_metrics.sub_registry_with_prefix(&name); - local_worker.register_metrics(worker_metrics); worker_names.insert(name.clone()); + worker_metrics.insert(name.clone(), metrics); let fut = Arc::new(OriginContext::new()) .wrap_async(trace_span!("worker_ctx"), local_worker.run()); spawn!("worker", fut, ?name) @@ -835,6 +930,7 @@ async fn inner_main( }; root_futures.push(Box::pin(spawn_fut.map_ok_or_else(|e| Err(e.into()), |v| v))); } + root_metrics.write().workers = worker_metrics; } if let Err(e) = select_all(root_futures).await.0 {