Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions examples/tensorrt_llm/configs/llama4/eagle/eagle_agg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Frontend:
  # This is the client-facing model name, you can set this to anything you'd like.
  served_model_name: "meta-llama/Llama-4-Maverick-17B-128E"
  # Dynamo endpoint that frontend requests are routed to.
  endpoint: dynamo.TensorRTLLMWorker.generate
  # HTTP port the frontend listens on.
  port: 8000
  router: round-robin

TensorRTLLMWorker:
  served_model_name: "meta-llama/Llama-4-Maverick-17B-128E"
  # HuggingFace model ID; a full path to locally downloaded weights also works
  # (see the note in the sibling engine configs).
  model-path: "meta-llama/Llama-4-Maverick-17B-128E"
  # Path to a YAML file containing additional keyword arguments to pass to the
  # TRTLLM engine; its fields take priority over the fields configured here.
  extra-engine-args: "configs/llama4/eagle/engine_configs/agg_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1
    resources:
      # GPUs allocated to this worker.
      gpu: 8
48 changes: 48 additions & 0 deletions examples/tensorrt_llm/configs/llama4/eagle/eagle_disagg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Frontend:
  # This is the client-facing model name, you can set this to anything you'd like.
  served_model_name: "meta-llama/Llama-4-Maverick-17B-128E"
  # Dynamo endpoint that frontend requests are routed to.
  endpoint: dynamo.TensorRTLLMWorker.generate
  # HTTP port the frontend listens on.
  port: 8000
  router: round-robin

# Decode-side worker of the disaggregated deployment (enable-disagg below).
TensorRTLLMWorker:
  served_model_name: "meta-llama/Llama-4-Maverick-17B-128E"
  # You can also specify the full path to locally downloaded weights
  # instead of a HuggingFace ID here.
  model-path: "meta-llama/Llama-4-Maverick-17B-128E"
  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
  # The fields in `extra-engine-args` hold higher priority than the above TRTLLM engine fields.
  extra-engine-args: "configs/llama4/eagle/engine_configs/decode_config.yaml"
  router: round-robin
  # Split prefill and decode across separate workers; prefill is handled by
  # TensorRTLLMPrefillWorker below.
  enable-disagg: true
  ServiceArgs:
    workers: 1
    resources:
      gpu: 8

# Prefill-side worker of the disaggregated deployment.
TensorRTLLMPrefillWorker:
  # You can also specify the full path to locally downloaded weights
  # instead of a HuggingFace ID here.
  model-path: "meta-llama/Llama-4-Maverick-17B-128E"
  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
  # The fields in `extra-engine-args` hold higher priority than the above TRTLLM engine fields.
  extra-engine-args: "configs/llama4/eagle/engine_configs/prefill_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1
    resources:
      gpu: 8
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE(review): the FP4 / DeepSeek-R1-FP4 note copied from the original
# template does not apply here — this config serves Llama-4 Maverick with an
# fp8 KV cache. Confirm and drop if so.
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.

backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 1
max_batch_size: 256
# 8448 = 8192 ISL + 256 OSL
max_num_tokens: 8448
max_seq_len: 8448

# Enable Speculative Decoding in the model engine
speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
  # Eagle3 draft model used for speculation; a local path also works.
  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
  # Lowercase canonical boolean (was `False`): consistent with the other
  # booleans in this file and unambiguous across YAML 1.1/1.2 loaders.
  eagle3_one_model: false

kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false

disable_overlap_scheduler: true

use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256
print_iter_log: true
kv_cache_dtype: fp8
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE(review): the FP4 / DeepSeek-R1-FP4 note copied from the original
# template does not apply here — this config serves Llama-4 Maverick with an
# fp8 KV cache. Confirm and drop if so.
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.

backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
max_batch_size: 256
# Note: When MTP (multi-token prediction; "MPT" in the original comment looks
# like a typo) is enabled and `cuda_graph_batch_sizes` is specified,
# `max_num_tokens` must satisfy the following formula:
# max_num_tokens >= max(cuda_graph_batch_sizes) * (num_nextn_predict_layers + 1)
# This is a known issue in TensorRT-LLM and will be resolved in the next release.
max_num_tokens: 512
# 8704 = 8192 ISL + 512 OSL
max_seq_len: 8704
disable_overlap_scheduler: true

# Enable Speculative Decoding in the model engine
speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
  # Eagle3 draft model used for speculation; a local path also works.
  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
  # Lowercase canonical boolean (was `False`): consistent with the other
  # booleans in this file and unambiguous across YAML 1.1/1.2 loaders.
  eagle3_one_model: false

kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false

use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256
print_iter_log: true
kv_cache_dtype: fp8
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE(review): the FP4 / DeepSeek-R1-FP4 note copied from the original
# template does not apply here — this config serves Llama-4 Maverick with an
# fp8 KV cache. Confirm and drop if so.
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.

backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
# Prefill worker processes one request at a time at full context length.
max_batch_size: 1
max_num_tokens: 8192
max_seq_len: 8192
print_iter_log: true
kv_cache_dtype: fp8
disable_overlap_scheduler: true

# Enable Speculative Decoding in the model engine
speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
  # Eagle3 draft model used for speculation; a local path also works.
  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
  # Lowercase canonical boolean (was `False`): consistent with the other
  # booleans in this file and unambiguous across YAML 1.1/1.2 loaders.
  eagle3_one_model: false

kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false
Loading