Skip to content

Commit

Permalink
[Breaking] Factor out health status checks to its own service (TraceM…
Browse files Browse the repository at this point in the history
…achina#823)

Health status checks are now a proper service and no longer enabled by
default.

Add `"health": {},` under your services config to enable.
  • Loading branch information
blizzardc0der authored and chinchaun committed May 6, 2024
1 parent b431d5f commit 42e5b55
Show file tree
Hide file tree
Showing 22 changed files with 159 additions and 81 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion deployment-examples/chromium/cas.json
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@
"cas_stores": {
"main": "CAS_MAIN_STORE",
}
}
},
"health": {},
}
}]
}
3 changes: 2 additions & 1 deletion deployment-examples/chromium/scheduler.json
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@
// are a frontend api.
"worker_api": {
"scheduler": "MAIN_SCHEDULER",
}
},
"health": {},
}
}]
}
3 changes: 2 additions & 1 deletion deployment-examples/docker-compose/local-storage-cas.json
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@
"cas_stores": {
"main": "CAS_MAIN_STORE",
}
}
},
"health": {},
}
}]
}
3 changes: 2 additions & 1 deletion deployment-examples/docker-compose/scheduler.json
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@
// are a frontend api.
"worker_api": {
"scheduler": "MAIN_SCHEDULER",
}
},
"health": {},
}
}]
}
3 changes: 2 additions & 1 deletion deployment-examples/kubernetes/cas.json
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@
"cas_stores": {
"main": "CAS_MAIN_STORE",
}
}
},
"health": {},
}
}]
}
3 changes: 2 additions & 1 deletion deployment-examples/kubernetes/scheduler.json
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@
// are a frontend api.
"worker_api": {
"scheduler": "MAIN_SCHEDULER",
}
},
"health": {},
}
}]
}
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@
"worker_api": {
"scheduler": "MAIN_SCHEDULER",
}
}
},
"health": {},
}]
}
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@
"cas_stores": {
"main": "CAS_STORE"
}
}
},
"health": {},
}
}]
}
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@
// are a frontend api.
"worker_api": {
"scheduler": "MAIN_SCHEDULER"
}
},
"health": {},
}
}]
}
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@
"services": {
"experimental_prometheus": {
"path": "/metrics"
}
},
"health": {},
}
}]
}
3 changes: 2 additions & 1 deletion nativelink-config/examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,8 @@ The `public` server consists of a `listener` object and a `services` object. The
"worker_api": {
"scheduler": "MAIN_SCHEDULER",
},
"admin": {}
"admin": {},
"health": {},
}
}],
"global": {},
Expand Down
3 changes: 2 additions & 1 deletion nativelink-config/examples/basic_cas.json
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,8 @@
"worker_api": {
"scheduler": "MAIN_SCHEDULER",
},
"admin": {}
"admin": {},
"health": {},
}
}],
"global": {
Expand Down
3 changes: 2 additions & 1 deletion nativelink-config/examples/filesystem_cas.json
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@
// are a frontend api.
"worker_api": {
"scheduler": "MAIN_SCHEDULER",
}
},
"health": {},
}
}]
}
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@
"cas_stores": {
"main": "CAS_MAIN_STORE",
}
}
},
"health": {},
}
}]
}
15 changes: 15 additions & 0 deletions nativelink-config/src/cas_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,18 @@ pub struct AdminConfig {
pub path: String,
}

#[derive(Deserialize, Debug, Default)]
#[serde(deny_unknown_fields)]
pub struct HealthConfig {
/// Path to register the health status check. If path is "/status", and your
/// domain is "example.com", you can reach the endpoint with:
/// <http://example.com/status>.
///
/// Default: "/status"
#[serde(default)]
pub path: String,
}

#[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct ServicesConfig {
Expand Down Expand Up @@ -228,6 +240,9 @@ pub struct ServicesConfig {
/// This is the service for any administrative tasks.
/// It provides a REST API endpoint for administrative purposes.
pub admin: Option<AdminConfig>,

/// This is the service for health status check.
pub health: Option<HealthConfig>,
}

#[derive(Deserialize, Debug)]
Expand Down
6 changes: 6 additions & 0 deletions nativelink-service/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ rust_library(
"src/capabilities_server.rs",
"src/cas_server.rs",
"src/execution_server.rs",
"src/health_server.rs",
"src/lib.rs",
"src/worker_api_server.rs",
],
Expand All @@ -27,13 +28,16 @@ rust_library(
"//nativelink-util",
"@crates//:bytes",
"@crates//:futures",
"@crates//:hyper",
"@crates//:log",
"@crates//:parking_lot",
"@crates//:prost",
"@crates//:rand",
"@crates//:serde_json5",
"@crates//:tokio",
"@crates//:tokio-stream",
"@crates//:tonic",
"@crates//:tower",
"@crates//:tracing",
"@crates//:uuid",
],
Expand Down Expand Up @@ -64,9 +68,11 @@ rust_test_suite(
"@crates//:prometheus-client",
"@crates//:prost",
"@crates//:prost-types",
"@crates//:serde_json5",
"@crates//:tokio",
"@crates//:tokio-stream",
"@crates//:tonic",
"@crates//:tower",
],
)

Expand Down
3 changes: 3 additions & 0 deletions nativelink-service/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@ nativelink-scheduler = { path = "../nativelink-scheduler" }

bytes = "1.6.0"
futures = "0.3.30"
hyper = { version = "0.14.28" }
serde_json5 = "0.1.0"
log = "0.4.21"
parking_lot = "0.12.1"
prost = "0.12.4"
rand = "0.8.5"
tokio = { version = "1.37.0", features = ["sync", "rt"] }
tokio-stream = { version = "0.1.15", features = ["sync"] }
tonic = { version = "0.11.0", features = ["gzip", "tls"] }
tower = "0.4.13"
tracing = "0.1.40"
uuid = { version = "1.8.0", features = ["v4"] }

Expand Down
82 changes: 82 additions & 0 deletions nativelink-service/src/health_server.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// Copyright 2024 The NativeLink Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};

use futures::StreamExt;
use hyper::header::{HeaderValue, CONTENT_TYPE};
use hyper::{Body, Request, Response, StatusCode};
use nativelink_util::health_utils::{
HealthRegistry, HealthStatus, HealthStatusDescription, HealthStatusReporter,
};
use tower::Service;

/// Content type header value for JSON.
const JSON_CONTENT_TYPE: &str = "application/json; charset=utf-8";

#[derive(Clone)]
pub struct HealthServer {
health_registry: HealthRegistry,
}

impl HealthServer {
pub fn new(health_registry: HealthRegistry) -> Self {
Self { health_registry }
}
}

impl Service<Request<hyper::Body>> for HealthServer {
type Response = Response<hyper::Body>;
type Error = std::convert::Infallible;
type Future = Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>> + Send>>;

fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
Poll::Ready(Ok(()))
}

fn call(&mut self, _req: Request<Body>) -> Self::Future {
let health_registry = self.health_registry.clone();
Box::pin(async move {
let health_status_descriptions: Vec<HealthStatusDescription> =
health_registry.health_status_report().collect().await;
match serde_json5::to_string(&health_status_descriptions) {
Ok(body) => {
let contains_failed_report =
health_status_descriptions.iter().any(|description| {
matches!(description.status, HealthStatus::Failed { .. })
});
let status_code = if contains_failed_report {
StatusCode::SERVICE_UNAVAILABLE
} else {
StatusCode::OK
};

Ok(Response::builder()
.status(status_code)
.header(CONTENT_TYPE, HeaderValue::from_static(JSON_CONTENT_TYPE))
.body(Body::from(body))
.unwrap())
}

Err(e) => Ok(Response::builder()
.status(StatusCode::INTERNAL_SERVER_ERROR)
.header(CONTENT_TYPE, HeaderValue::from_static(JSON_CONTENT_TYPE))
.body(Body::from(format!("Internal Failure: {e:?}")))
.unwrap()),
}
})
}
}
1 change: 1 addition & 0 deletions nativelink-service/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ pub mod bytestream_server;
pub mod capabilities_server;
pub mod cas_server;
pub mod execution_server;
pub mod health_server;
pub mod worker_api_server;
8 changes: 6 additions & 2 deletions nativelink-util/src/health_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,17 @@ pub struct HealthRegistry {
}

pub trait HealthStatusReporter {
fn health_status_report(&self) -> Pin<Box<dyn Stream<Item = HealthStatusDescription> + '_>>;
fn health_status_report(
&self,
) -> Pin<Box<dyn Stream<Item = HealthStatusDescription> + Send + '_>>;
}

/// Health status reporter implementation for the health registry that provides a stream
/// of health status descriptions.
impl HealthStatusReporter for HealthRegistry {
fn health_status_report(&self) -> Pin<Box<dyn Stream<Item = HealthStatusDescription> + '_>> {
fn health_status_report(
&self,
) -> Pin<Box<dyn Stream<Item = HealthStatusDescription> + Send + '_>> {
Box::pin(futures::stream::iter(self.indicators.iter()).then(
|(namespace, indicator)| async move {
HealthStatusDescription {
Expand Down
Loading

0 comments on commit 42e5b55

Please sign in to comment.