Skip to content

Commit 7799661

Browse files
authored
Merge branch 'main' into ziqif/enable-kvbm-metrics
2 parents 371af5e + ea391f3 commit 7799661

File tree

19 files changed

+209
-196
lines changed

19 files changed

+209
-196
lines changed

components/backends/vllm/launch/dsr1_dep.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ while [[ $# -gt 0 ]]; do
4747
echo " --gpus-per-node L Number of GPUs per node (required, int)"
4848
echo " --master-addr ADDR Master node address (default: localhost)"
4949
echo " --log-dir DIR Directory for log files (default: ./logs)"
50-
echo " --model MODEL Model name to use (default: deepseek-ai/DeepSeek-R1)"
51-
echo " -h, --help Show this help message"
50+
echo " --model MODEL Model name to use (default: ${MODEL})"
51+
echo " -h, --help Show this help message"
5252
exit 0
5353
;;
5454
*)

docs/support_matrix.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ If you are using a **GPU**, the following GPU models and architectures are suppo
8383

8484

8585
> [!Caution]
86-
> ¹ There is a known issue with the TensorRT-LLM framework when installed within the AL2023 container via the Python Wheels which makes effective environment setup challenging.
86+
> ¹ There is a known issue with the TensorRT-LLM framework when running the AL2023 container locally with `docker run --network host ...` due to a [bug](https://github.com/mpi4py/mpi4py/discussions/491#discussioncomment-12660609) in mpi4py. To avoid this issue, replace the `--network host` flag with a more precise networking configuration that maps only the necessary ports (e.g., 4222 for NATS, 2379/2380 for etcd, 8080 for the frontend).
8787
8888

8989
## Build Support
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
apiVersion: nvidia.com/v1alpha1
5+
kind: DynamoGraphDeployment
6+
metadata:
7+
name: agg-llava
8+
spec:
9+
backendFramework: vllm
10+
services:
11+
Frontend:
12+
dynamoNamespace: agg-llava
13+
componentType: frontend
14+
replicas: 1
15+
extraPodSpec:
16+
mainContainer:
17+
image: my-registry/vllm-runtime:my-tag
18+
EncodeWorker:
19+
envFromSecret: hf-token-secret
20+
dynamoNamespace: agg-llava
21+
componentType: worker
22+
replicas: 1
23+
resources:
24+
limits:
25+
gpu: "1"
26+
extraPodSpec:
27+
mainContainer:
28+
image: my-registry/vllm-runtime:my-tag
29+
workingDir: /workspace/examples/multimodal
30+
command:
31+
- /bin/sh
32+
- -c
33+
args:
34+
- python3 components/encode_worker.py --model llava-hf/llava-1.5-7b-hf
35+
VLMWorker:
36+
envFromSecret: hf-token-secret
37+
dynamoNamespace: agg-llava
38+
componentType: worker
39+
replicas: 1
40+
resources:
41+
limits:
42+
gpu: "1"
43+
extraPodSpec:
44+
mainContainer:
45+
image: my-registry/vllm-runtime:my-tag
46+
workingDir: /workspace/examples/multimodal
47+
command:
48+
- /bin/sh
49+
- -c
50+
args:
51+
- python3 components/worker.py --model llava-hf/llava-1.5-7b-hf --worker-type prefill
52+
Processor:
53+
envFromSecret: hf-token-secret
54+
dynamoNamespace: agg-llava
55+
componentType: worker
56+
replicas: 1
57+
resources:
58+
limits:
59+
gpu: "1"
60+
extraPodSpec:
61+
mainContainer:
62+
image: my-registry/vllm-runtime:my-tag
63+
workingDir: /workspace/examples/multimodal
64+
command:
65+
- /bin/sh
66+
- -c
67+
args:
68+
- 'python3 components/processor.py --model llava-hf/llava-1.5-7b-hf --prompt-template "USER: <image>\n<prompt> ASSISTANT:"'

lib/bindings/python/rust/llm/entrypoint.rs

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use dynamo_llm::kv_router::KvRouterConfig as RsKvRouterConfig;
1313
use dynamo_llm::local_model::DEFAULT_HTTP_PORT;
1414
use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
1515
use dynamo_llm::mocker::protocols::MockEngineArgs;
16-
use dynamo_runtime::protocols::Endpoint as EndpointId;
16+
use dynamo_runtime::protocols::EndpointId;
1717

1818
use crate::RouterMode;
1919

@@ -130,14 +130,7 @@ impl EntrypointArgs {
130130
tls_key_path: Option<PathBuf>,
131131
extra_engine_args: Option<PathBuf>,
132132
) -> PyResult<Self> {
133-
let endpoint_id_obj: Option<EndpointId> = match endpoint_id {
134-
Some(eid) => Some(eid.parse().map_err(|_| {
135-
PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
136-
"Invalid endpoint_id format: {eid}"
137-
))
138-
})?),
139-
None => None,
140-
};
133+
let endpoint_id_obj: Option<EndpointId> = endpoint_id.as_deref().map(EndpointId::from);
141134
if (tls_cert_path.is_some() && tls_key_path.is_none())
142135
|| (tls_cert_path.is_none() && tls_key_path.is_some())
143136
{

lib/llm/src/discovery/model_entry.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,12 @@ use crate::{
2121
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
2222
pub struct ModelEntry {
2323
/// Public name of the model
24-
/// This will be used to identify the model in the HTTP service from the value used in an an OpenAI ChatRequest.
24+
/// Used to identify the model in the HTTP service from the value used in an OpenAI ChatRequest.
2525
pub name: String,
2626

2727
/// How to address this on the network
28-
pub endpoint: protocols::Endpoint,
28+
#[serde(rename = "endpoint")]
29+
pub endpoint_id: protocols::EndpointId,
2930

3031
/// Specifies whether the model is a chat, completions, etc model.
3132
pub model_type: ModelType,
@@ -45,8 +46,8 @@ impl ModelEntry {
4546
matches!(self.model_type, ModelType::Backend)
4647
}
4748

48-
/// Fetch the ModelDeploymentCard from NATS.
49-
/// This does not touch it's fields so you may need to call move_from_nats on it.
49+
/// Fetch the ModelDeploymentCard from etcd.
50+
/// This does not touch its fields so you may need to call move_from_nats on it.
5051
pub async fn load_mdc(
5152
&self,
5253
etcd_client: &etcd::Client,

lib/llm/src/discovery/watcher.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ impl ModelWatcher {
268268
// Handles a PUT event from etcd; this usually means adding a new model to the list of served
269269
// models.
270270
async fn handle_put(&self, model_entry: &ModelEntry) -> anyhow::Result<()> {
271-
let endpoint_id = model_entry.endpoint.clone();
271+
let endpoint_id = &model_entry.endpoint_id;
272272
let component = self
273273
.drt
274274
.namespace(&endpoint_id.namespace)?

lib/llm/src/entrypoint/input/endpoint.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use dynamo_runtime::engine::AsyncEngineStream;
2020
use dynamo_runtime::pipeline::{
2121
network::Ingress, Context, ManyOut, Operator, SegmentSource, ServiceBackend, SingleIn, Source,
2222
};
23-
use dynamo_runtime::{protocols::Endpoint as EndpointId, DistributedRuntime};
23+
use dynamo_runtime::{protocols::EndpointId, DistributedRuntime};
2424

2525
use crate::entrypoint::EngineConfig;
2626

@@ -141,7 +141,7 @@ pub async fn run(
141141
#[cfg(feature = "integration")]
142142
mod integration_tests {
143143
use super::*;
144-
use dynamo_runtime::protocols::Endpoint as EndpointId;
144+
use dynamo_runtime::protocols::EndpointId;
145145

146146
async fn create_test_environment() -> anyhow::Result<(DistributedRuntime, EngineConfig)> {
147147
// Create a minimal distributed runtime and engine config for testing

lib/llm/src/http/service/clear_kv_blocks.rs

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,5 @@
11
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
// SPDX-License-Identifier: Apache-2.0
3-
//
4-
// Licensed under the Apache License, Version 2.0 (the "License");
5-
// you may not use this file except in compliance with the License.
6-
// You may obtain a copy of the License at
7-
//
8-
// http://www.apache.org/licenses/LICENSE-2.0
9-
//
10-
// Unless required by applicable law or agreed to in writing, software
11-
// distributed under the License is distributed on an "AS IS" BASIS,
12-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
// See the License for the specific language governing permissions and
14-
// limitations under the License.
153

164
use super::{service_v2, RouteDoc};
175
use axum::{http::Method, response::IntoResponse, routing::post, Json, Router};
@@ -88,8 +76,8 @@ async fn clear_kv_blocks_handler(
8876

8977
// create client for each model entry
9078
for entry in &model_entries {
91-
let namespace = &entry.endpoint.namespace;
92-
let component = &entry.endpoint.component;
79+
let namespace = &entry.endpoint_id.namespace;
80+
let component = &entry.endpoint_id.component;
9381
let entry_name = entry.name.to_string();
9482

9583
tracing::debug!("Processing worker group: {}/{}", namespace, component);

lib/llm/src/http/service/health.rs

Lines changed: 1 addition & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,5 @@
11
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
// SPDX-License-Identifier: Apache-2.0
3-
//
4-
// Licensed under the Apache License, Version 2.0 (the "License");
5-
// you may not use this file except in compliance with the License.
6-
// You may obtain a copy of the License at
7-
//
8-
// http://www.apache.org/licenses/LICENSE-2.0
9-
//
10-
// Unless required by applicable law or agreed to in writing, software
11-
// distributed under the License is distributed on an "AS IS" BASIS,
12-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
// See the License for the specific language governing permissions and
14-
// limitations under the License.
15-
16-
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
17-
// SPDX-License-Identifier: Apache-2.0
18-
//
19-
// Licensed under the Apache License, Version 2.0 (the "License");
20-
// you may not use this file except in compliance with the License.
21-
// You may obtain a copy of the License at
22-
//
23-
// http://www.apache.org/licenses/LICENSE-2.0
24-
//
25-
// Unless required by applicable law or agreed to in writing, software
26-
// distributed under the License is distributed on an "AS IS" BASIS,
27-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28-
// See the License for the specific language governing permissions and
29-
// limitations under the License.
303

314
use super::{service_v2, RouteDoc};
325
use axum::{http::Method, http::StatusCode, response::IntoResponse, routing::get, Json, Router};
@@ -104,7 +77,7 @@ async fn health_handler(
10477
} else {
10578
let endpoints: Vec<String> = model_entries
10679
.iter()
107-
.map(|entry| entry.endpoint.as_url())
80+
.map(|entry| entry.endpoint_id.as_url())
10881
.collect();
10982
(
11083
StatusCode::OK,

lib/llm/src/local_model.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::path::{Path, PathBuf};
66
use std::sync::Arc;
77

88
use anyhow::Context as _;
9-
use dynamo_runtime::protocols::Endpoint as EndpointId;
9+
use dynamo_runtime::protocols::EndpointId;
1010
use dynamo_runtime::slug::Slug;
1111
use dynamo_runtime::traits::DistributedRuntimeProvider;
1212
use dynamo_runtime::{
@@ -402,7 +402,7 @@ impl LocalModel {
402402
tracing::debug!("Registering with etcd as {network_name}");
403403
let model_registration = ModelEntry {
404404
name: self.display_name().to_string(),
405-
endpoint: endpoint.id(),
405+
endpoint_id: endpoint.id(),
406406
model_type,
407407
runtime_config: Some(self.runtime_config.clone()),
408408
};

0 commit comments

Comments
 (0)