examples/tensorrt_llm/README.md (+48)

```bash
unset TRTLLM_USE_NIXL_KVCACHE
export TRTLLM_USE_UCX_KVCACHE=1
```


### Example architectures for Llama 4 Maverick Instruct + Eagle Speculative Decoding

#### Notes
* Testing for the current example used:
  * One GB200x4 node for aggregated serving
  * Two GB200x4 nodes for disaggregated serving
* To run Eagle Speculative Decoding with Llama 4, ensure the container meets the following criteria:
  * It is built with a version of TensorRT-LLM based on the [0.21 release](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.21)
  * The TensorRT-LLM build includes the changes from [PR #5975](https://github.com/NVIDIA/TensorRT-LLM/pull/5975)
* If you need to download model weights from Hugging Face, run `huggingface-cli login` first and make sure you have access to the necessary gated models (see the snippet after this list).
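
For example, to authenticate and pre-fetch the gated weights referenced by the configs below (a minimal sketch; `huggingface-cli download` assumes a reasonably recent `huggingface_hub`):

```bash
# Authenticate once; the token is cached locally.
huggingface-cli login
# Optionally pre-download the gated weights so the first serve doesn't block on it.
huggingface-cli download nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
huggingface-cli download nvidia/Llama-4-Maverick-17B-128E-Eagle3
```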

##### Aggregated Serving
```bash
cd /workspace/examples/tensorrt_llm
dynamo serve graphs.agg:Frontend -f configs/llama4/eagle/eagle_agg.yaml
```
* Known issue: in aggregated serving, setting `max_num_tokens` to higher values (e.g. `max_num_tokens: 8448`) can lead to out-of-memory (OOM) errors. The TRTLLM team is investigating this.
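
Once the graph is up, you can sanity-check it with an OpenAI-style chat request (a sketch; this assumes the frontend exposes the usual `/v1/chat/completions` route on the configured port 8000):

```bash
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 32
  }'
```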

##### Disaggregated Serving

###### Head Node
Start NATS and etcd:
```bash
nats-server -js &
etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0.0.0:2379 --data-dir /tmp/etcd &
```
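
Before launching the graph, you can verify that both services are up (a quick check assuming the default ports; etcd serves a `/health` endpoint on its client port):

```bash
# etcd reports {"health":"true"} when it is ready.
curl -s http://localhost:2379/health
# Check that the NATS client port is accepting connections.
nc -zv localhost 4222
```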

Launch the graph with the Frontend and the decode TensorRTLLMWorker on the head node:

```bash
cd /workspace/examples/tensorrt_llm
dynamo serve graphs.disagg:Frontend -f configs/llama4/eagle/eagle_disagg.yaml &
```
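
You can confirm the frontend is ready by listing the served models (assuming the standard OpenAI-compatible `/v1/models` route):

```bash
curl -s http://localhost:8000/v1/models
```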

###### Worker Node(s)
Set environment variables pointing at the etcd/NATS endpoints on the head node:
```bash
export HEAD_NODE_IP="<head-node-ip>"
export NATS_SERVER="nats://${HEAD_NODE_IP}:4222"
export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379"
```
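
Before deploying, it's worth confirming that the worker node can actually reach the head node (a sketch using standard tools):

```bash
# Both ports must be reachable from the worker node.
nc -zv "${HEAD_NODE_IP}" 4222   # NATS
nc -zv "${HEAD_NODE_IP}" 2379   # etcd
```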

Deploy a Prefill worker:
```bash
cd /workspace/examples/tensorrt_llm
dynamo serve components.prefill_worker:TensorRTLLMPrefillWorker -f configs/llama4/eagle/eagle_disagg.yaml --service-name TensorRTLLMPrefillWorker &
```
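
With the prefill worker registered, requests to the head-node frontend are served disaggregated. A quick end-to-end check (assuming the same OpenAI-compatible route as in aggregated serving):

```bash
curl -s http://${HEAD_NODE_IP}:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
    "messages": [{"role": "user", "content": "Write a haiku about GPUs."}],
    "max_tokens": 64
  }'
```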
examples/tensorrt_llm/configs/llama4/eagle/eagle_agg.yaml (+31)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Frontend:
  # This is the client-facing model name; you can set it to anything you'd like.
  served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
  endpoint: dynamo.TensorRTLLMWorker.generate
  port: 8000
  router: round-robin

TensorRTLLMWorker:
  served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
  model-path: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
  extra-engine-args: "configs/llama4/eagle/engine_configs/agg_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1
    resources:
      gpu: 4
examples/tensorrt_llm/configs/llama4/eagle/eagle_disagg.yaml (+44)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Frontend:
  served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
  endpoint: dynamo.TensorRTLLMWorker.generate
  port: 8000
  router: round-robin

TensorRTLLMWorker:
  served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
  model-path: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
  # Fields in the `extra-engine-args` file take precedence over the TRTLLM engine fields above.
  extra-engine-args: "configs/llama4/eagle/engine_configs/decode_config.yaml"
  router: round-robin
  enable-disagg: true
  ServiceArgs:
    workers: 1
    resources:
      gpu: 4

TensorRTLLMPrefillWorker:
  model-path: "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
  # Fields in the `extra-engine-args` file take precedence over the TRTLLM engine fields above.
  extra-engine-args: "configs/llama4/eagle/engine_configs/prefill_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1
    resources:
      gpu: 4
examples/tensorrt_llm/configs/llama4/eagle/engine_configs/agg_config.yaml (+51)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
max_batch_size: 256
# Setting max_num_tokens to higher values can cause OOM issues.
# This is being investigated with the TRTLLM team.
max_num_tokens: 1024
max_seq_len: 8448
autotuner_enabled: false
disable_overlap_scheduler: true

# Enable Speculative Decoding in the model engine
speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
  eagle3_one_model: False

kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false

use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256
print_iter_log: true
kv_cache_dtype: fp8
examples/tensorrt_llm/configs/llama4/eagle/engine_configs/decode_config.yaml (+50)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
max_batch_size: 256
max_num_tokens: 512
# 8704 = 8192 ISL + 512 OSL
max_seq_len: 8704
disable_overlap_scheduler: true
autotuner_enabled: false

# Enable Speculative Decoding in the model engine
speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
  eagle3_one_model: False

kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false

use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256
print_iter_log: true
kv_cache_dtype: fp8
examples/tensorrt_llm/configs/llama4/eagle/engine_configs/prefill_config.yaml (+36)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
max_batch_size: 1
max_num_tokens: 8192
max_seq_len: 8192
print_iter_log: true
kv_cache_dtype: fp8
disable_overlap_scheduler: true
autotuner_enabled: false

# Enable Speculative Decoding in the model engine
speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
  eagle3_one_model: False

kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false