Merge branch 'main' into ziqif/enable-kvbm-metrics

ziqifan617 · web-flow · commit 7799661fe4e3 · 2025-08-22T10:24:20.000-07:00
diff --git a/components/backends/vllm/launch/dsr1_dep.sh b/components/backends/vllm/launch/dsr1_dep.sh
@@ -47,8 +47,8 @@ while [[ $# -gt 0 ]]; do
             echo "  --gpus-per-node L     Number of GPUs per node (required, int)"
             echo "  --master-addr ADDR    Master node address (default: localhost)"
             echo "  --log-dir DIR         Directory for log files (default: ./logs)"
-            echo "  --model MODEL    Model name to use (default: deepseek-ai/DeepSeek-R1)"
-            echo "  -h, --help           Show this help message"
+            echo "  --model MODEL         Model name to use (default: ${MODEL})"
+            echo "  -h, --help            Show this help message"
             exit 0
             ;;
         *)
diff --git a/docs/support_matrix.md b/docs/support_matrix.md
@@ -83,7 +83,7 @@ If you are using a **GPU**, the following GPU models and architectures are suppo
 
 
 > [!Caution]
-> ¹ There is a known issue with the TensorRT-LLM framework when installed within the AL2023 container via the Python Wheels which makes effective environment setup challenging.
+> ¹ There is a known issue with the TensorRT-LLM framework when running the AL2023 container locally with `docker run --network host ...` due to a [bug](https://github.com/mpi4py/mpi4py/discussions/491#discussioncomment-12660609) in mpi4py. To avoid this issue, replace the `--network host` flag with a more precise networking configuration that maps only the necessary ports (e.g., 4222 for NATS, 2379/2380 for etcd, 8080 for the frontend).
 
 
 ## Build Support
diff --git a/examples/multimodal/deploy/agg_llava.yaml b/examples/multimodal/deploy/agg_llava.yaml
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: agg-llava
+spec:
+  backendFramework: vllm
+  services:
+    Frontend:
+      dynamoNamespace: agg-llava
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/vllm-runtime:my-tag
+    EncodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-llava
+      componentType: worker
+      replicas: 1
+      resources:
+        limits:
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/vllm-runtime:my-tag
+          workingDir: /workspace/examples/multimodal
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - python3 components/encode_worker.py --model llava-hf/llava-1.5-7b-hf
+    VLMWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-llava
+      componentType: worker
+      replicas: 1
+      resources:
+        limits:
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/vllm-runtime:my-tag
+          workingDir: /workspace/examples/multimodal
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - python3 components/worker.py --model llava-hf/llava-1.5-7b-hf --worker-type prefill
+    Processor:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-llava
+      componentType: worker
+      replicas: 1
+      resources:
+        limits:
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: my-registry/vllm-runtime:my-tag
+          workingDir: /workspace/examples/multimodal
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - 'python3 components/processor.py --model llava-hf/llava-1.5-7b-hf --prompt-template "USER: <image>\n<prompt> ASSISTANT:"'
diff --git a/lib/bindings/python/rust/llm/entrypoint.rs b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -13,7 +13,7 @@ use dynamo_llm::kv_router::KvRouterConfig as RsKvRouterConfig;
 use dynamo_llm::local_model::DEFAULT_HTTP_PORT;
 use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
 use dynamo_llm::mocker::protocols::MockEngineArgs;
-use dynamo_runtime::protocols::Endpoint as EndpointId;
+use dynamo_runtime::protocols::EndpointId;
 
 use crate::RouterMode;
 
@@ -130,14 +130,7 @@ impl EntrypointArgs {
         tls_key_path: Option<PathBuf>,
         extra_engine_args: Option<PathBuf>,
     ) -> PyResult<Self> {
-        let endpoint_id_obj: Option<EndpointId> = match endpoint_id {
-            Some(eid) => Some(eid.parse().map_err(|_| {
-                PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                    "Invalid endpoint_id format: {eid}"
-                ))
-            })?),
-            None => None,
-        };
+        let endpoint_id_obj: Option<EndpointId> = endpoint_id.as_deref().map(EndpointId::from);
         if (tls_cert_path.is_some() && tls_key_path.is_none())
             || (tls_cert_path.is_none() && tls_key_path.is_some())
         {
diff --git a/lib/llm/src/discovery/model_entry.rs b/lib/llm/src/discovery/model_entry.rs
@@ -21,11 +21,12 @@ use crate::{
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
 pub struct ModelEntry {
     /// Public name of the model
-    /// This will be used to identify the model in the HTTP service from the value used in an an OpenAI ChatRequest.
+    /// Used to identify the model in the HTTP service from the value used in an OpenAI ChatRequest.
     pub name: String,
 
     /// How to address this on the network
-    pub endpoint: protocols::Endpoint,
+    #[serde(rename = "endpoint")]
+    pub endpoint_id: protocols::EndpointId,
 
     /// Specifies whether the model is a chat, completions, etc model.
     pub model_type: ModelType,
@@ -45,8 +46,8 @@ impl ModelEntry {
         matches!(self.model_type, ModelType::Backend)
     }
 
-    /// Fetch the ModelDeploymentCard from NATS.
-    /// This does not touch it's fields so you may need to call move_from_nats on it.
+    /// Fetch the ModelDeploymentCard from etcd.
+    /// This does not touch its fields so you may need to call move_from_nats on it.
     pub async fn load_mdc(
         &self,
         etcd_client: &etcd::Client,
diff --git a/lib/llm/src/discovery/watcher.rs b/lib/llm/src/discovery/watcher.rs
@@ -268,7 +268,7 @@ impl ModelWatcher {
     // Handles a PUT event from etcd, this usually means adding a new model to the list of served
     // models.
     async fn handle_put(&self, model_entry: &ModelEntry) -> anyhow::Result<()> {
-        let endpoint_id = model_entry.endpoint.clone();
+        let endpoint_id = &model_entry.endpoint_id;
         let component = self
             .drt
             .namespace(&endpoint_id.namespace)?
diff --git a/lib/llm/src/entrypoint/input/endpoint.rs b/lib/llm/src/entrypoint/input/endpoint.rs
@@ -20,7 +20,7 @@ use dynamo_runtime::engine::AsyncEngineStream;
 use dynamo_runtime::pipeline::{
     network::Ingress, Context, ManyOut, Operator, SegmentSource, ServiceBackend, SingleIn, Source,
 };
-use dynamo_runtime::{protocols::Endpoint as EndpointId, DistributedRuntime};
+use dynamo_runtime::{protocols::EndpointId, DistributedRuntime};
 
 use crate::entrypoint::EngineConfig;
 
@@ -141,7 +141,7 @@ pub async fn run(
 #[cfg(feature = "integration")]
 mod integration_tests {
     use super::*;
-    use dynamo_runtime::protocols::Endpoint as EndpointId;
+    use dynamo_runtime::protocols::EndpointId;
 
     async fn create_test_environment() -> anyhow::Result<(DistributedRuntime, EngineConfig)> {
         // Create a minimal distributed runtime and engine config for testing
diff --git a/lib/llm/src/http/service/clear_kv_blocks.rs b/lib/llm/src/http/service/clear_kv_blocks.rs
@@ -1,17 +1,5 @@
 // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
 
 use super::{service_v2, RouteDoc};
 use axum::{http::Method, response::IntoResponse, routing::post, Json, Router};
@@ -88,8 +76,8 @@ async fn clear_kv_blocks_handler(
 
     // create client for each model entry
     for entry in &model_entries {
-        let namespace = &entry.endpoint.namespace;
-        let component = &entry.endpoint.component;
+        let namespace = &entry.endpoint_id.namespace;
+        let component = &entry.endpoint_id.component;
         let entry_name = entry.name.to_string();
 
         tracing::debug!("Processing worker group: {}/{}", namespace, component);
diff --git a/lib/llm/src/http/service/health.rs b/lib/llm/src/http/service/health.rs
@@ -1,32 +1,5 @@
 // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
 
 use super::{service_v2, RouteDoc};
 use axum::{http::Method, http::StatusCode, response::IntoResponse, routing::get, Json, Router};
@@ -104,7 +77,7 @@ async fn health_handler(
     } else {
         let endpoints: Vec<String> = model_entries
             .iter()
-            .map(|entry| entry.endpoint.as_url())
+            .map(|entry| entry.endpoint_id.as_url())
             .collect();
         (
             StatusCode::OK,
diff --git a/lib/llm/src/local_model.rs b/lib/llm/src/local_model.rs
@@ -6,7 +6,7 @@ use std::path::{Path, PathBuf};
 use std::sync::Arc;
 
 use anyhow::Context as _;
-use dynamo_runtime::protocols::Endpoint as EndpointId;
+use dynamo_runtime::protocols::EndpointId;
 use dynamo_runtime::slug::Slug;
 use dynamo_runtime::traits::DistributedRuntimeProvider;
 use dynamo_runtime::{
@@ -402,7 +402,7 @@ impl LocalModel {
         tracing::debug!("Registering with etcd as {network_name}");
         let model_registration = ModelEntry {
             name: self.display_name().to_string(),
-            endpoint: endpoint.id(),
+            endpoint_id: endpoint.id(),
             model_type,
             runtime_config: Some(self.runtime_config.clone()),
         };
diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
@@ -440,7 +440,7 @@ impl AnnotatedMockEngine {
     pub fn new(
         inner: MockVllmEngine,
         distributed_runtime: DistributedRuntime,
-        endpoint: dynamo_runtime::protocols::Endpoint,
+        endpoint_id: dynamo_runtime::protocols::EndpointId,
     ) -> Self {
         let inner = Arc::new(inner);
         let inner_clone = inner.clone();
@@ -449,13 +449,13 @@ impl AnnotatedMockEngine {
         tokio::spawn(async move {
             loop {
                 // Try to create component
-                let Ok(namespace) = distributed_runtime.namespace(&endpoint.namespace) else {
+                let Ok(namespace) = distributed_runtime.namespace(&endpoint_id.namespace) else {
                     tracing::debug!("Namespace not available yet, retrying...");
                     tokio::time::sleep(Duration::from_millis(100)).await;
                     continue;
                 };
 
-                let Ok(component) = namespace.component(&endpoint.component) else {
+                let Ok(component) = namespace.component(&endpoint_id.component) else {
                     tracing::debug!("Component not available yet, retrying...");
                     tokio::time::sleep(Duration::from_millis(100)).await;
                     continue;
@@ -509,13 +509,13 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
 /// Create a mocker engine as ExecutionContext
 pub async fn make_mocker_engine(
     distributed_runtime: DistributedRuntime,
-    endpoint: dynamo_runtime::protocols::Endpoint,
+    endpoint_id: dynamo_runtime::protocols::EndpointId,
     args: MockEngineArgs,
 ) -> Result<crate::backend::ExecutionContext, Error> {
     // Create the mocker engine
     tracing::info!("Creating mocker engine with config: {args:?}");
     let annotated_engine =
-        AnnotatedMockEngine::new(MockVllmEngine::new(args), distributed_runtime, endpoint);
+        AnnotatedMockEngine::new(MockVllmEngine::new(args), distributed_runtime, endpoint_id);
 
     Ok(Arc::new(annotated_engine))
 }
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
@@ -1,17 +1,5 @@
 // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
 
 //! Asynchronous Scheduler for LLM Request Management
 //!
@@ -207,7 +195,7 @@ impl SchedulerState {
 
     /// Remove a UUID and its associated Request from collections.
     fn complete(&mut self, uuid: &Uuid) {
-        tracing::debug!("Request {} will complete", uuid);
+        tracing::trace!("Request {uuid} will complete");
         self.decode.remove(uuid);
         self.requests.remove(uuid);
         self.prefill_costs.remove(uuid);
diff --git a/lib/llm/src/protocols/common/llm_backend.rs b/lib/llm/src/protocols/common/llm_backend.rs
@@ -192,10 +192,5 @@ mod tests {
         assert_eq!(format!("{}", output.err().unwrap()), "Test error");
         assert!(!output.is_ok());
         assert!(output.is_err());
-
-        let output = LLMEngineOutput::from_err(anyhow::Error::msg("Test error 2").into());
-        assert_eq!(format!("{}", output.err().unwrap()), "Test error 2");
-        assert!(!output.is_ok());
-        assert!(output.is_err());
     }
 }
diff --git a/lib/runtime/src/component.rs b/lib/runtime/src/component.rs
@@ -47,7 +47,7 @@ use super::{
 };
 
 use crate::pipeline::network::{ingress::push_endpoint::PushEndpoint, PushWorkHandler};
-use crate::protocols::Endpoint as EndpointId;
+use crate::protocols::EndpointId;
 use crate::service::ComponentNatsServerPrometheusMetrics;
 use async_nats::{
     rustls::quic,
diff --git a/lib/runtime/src/protocols.rs b/lib/runtime/src/protocols.rs
diff --git a/lib/runtime/src/protocols/annotated.rs b/lib/runtime/src/protocols/annotated.rs
diff --git a/lib/runtime/src/storage/key_value_store/nats.rs b/lib/runtime/src/storage/key_value_store/nats.rs
diff --git a/tests/serve/test_sglang.py b/tests/serve/test_sglang.py
diff --git a/tests/serve/test_vllm.py b/tests/serve/test_vllm.py