@@ -0,0 +1,50 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: FP4 is only supported on Blackwell GPUs and newer.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.

backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
# enable_attention_dp: true
max_batch_size: 256
# 8448 = 8192 ISL (input sequence length) + 256 OSL (output sequence length)
max_num_tokens: 8448
max_seq_len: 8448
kv_cache_config:
  free_gpu_memory_fraction: 0.30

# Enable MTP (Multi-Token Prediction) in the model engine.
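# Each next-n prediction layer drafts one additional token per decoding step;
# with num_nextn_predict_layers: 1 the engine proposes one draft token to verify.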
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 1

use_cuda_graph: true
cuda_graph_padding_enabled: true
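# Padding rounds each batch up to the nearest captured size below, so batch
# sizes that fall between the listed entries can still reuse a captured graph.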
cuda_graph_batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256
print_iter_log: true
kv_cache_dtype: fp8
@@ -0,0 +1,53 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: FP4 is only supported on Blackwell GPUs and newer.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.

backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
# enable_attention_dp: false
max_batch_size: 256
# Note: When MTP is enabled and `cuda_graph_batch_sizes` is specified, `max_num_tokens` must satisfy:
# max_num_tokens >= max(cuda_graph_batch_sizes) * (num_nextn_predict_layers + 1)
# This is a known issue in TensorRT-LLM and will be resolved in the next release.
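# For this config: max(cuda_graph_batch_sizes) = 256 and num_nextn_predict_layers = 1,
# so max_num_tokens must be at least 256 * (1 + 1) = 512, which gives the value below.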
max_num_tokens: 512
# 8704 = 8192 ISL (input sequence length) + 512 OSL (output sequence length)
max_seq_len: 8704
kv_cache_config:
  free_gpu_memory_fraction: 0.85

# Enable MTP (Multi-Token Prediction) in the decode model engine.
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 1

use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256
print_iter_log: true
kv_cache_dtype: fp8
@@ -0,0 +1,37 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: FP4 is only supported on Blackwell GPUs and newer.
# https://huggingface.co/nvidia/DeepSeek-R1-FP4
# You can also specify the full path to locally downloaded weights
# instead of a HuggingFace ID here.

backend: pytorch
tensor_parallel_size: 4
moe_expert_parallel_size: 4
# enable_attention_dp: true
max_batch_size: 1
max_num_tokens: 8192
max_seq_len: 8192
kv_cache_config:
  free_gpu_memory_fraction: 0.75
print_iter_log: true
kv_cache_dtype: fp8
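# The overlap scheduler pipelines successive decode iterations; a prefill-only
# worker performs a single context pass per request, so it is disabled here.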
disable_overlap_scheduler: true

# Enable MTP (Multi-Token Prediction) in the prefill model engine.
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 1
31 changes: 31 additions & 0 deletions examples/tensorrt_llm/configs/llama4/eagle/mtp_agg.yaml
@@ -0,0 +1,31 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Frontend:
  # This is the client-facing model name; you can set it to anything you'd like.
  served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Eagle3"
  endpoint: dynamo.TensorRTLLMWorker.generate
  port: 8000
  router: round-robin
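# A minimal smoke test, assuming the Frontend exposes an OpenAI-compatible
# /v1/chat/completions route on the port above:
# curl http://localhost:8000/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -d '{"model": "nvidia/Llama-4-Maverick-17B-128E-Eagle3",
#        "messages": [{"role": "user", "content": "Hello"}]}'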

TensorRTLLMWorker:
  served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Eagle3"
  model-path: "nvidia/Llama-4-Maverick-17B-128E-Eagle3"
  extra-engine-args: "configs/llama4/engine_configs/agg_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1
    resources:
      gpu: 8
52 changes: 52 additions & 0 deletions examples/tensorrt_llm/configs/llama4/eagle/mtp_disagg.yaml
@@ -0,0 +1,52 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Frontend:
  served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Eagle3"
  endpoint: dynamo.TensorRTLLMWorker.generate
  port: 8000
  router: round-robin

TensorRTLLMWorker:
  served_model_name: "nvidia/Llama-4-Maverick-17B-128E-Eagle3"
  # NOTE: FP4 is only supported on Blackwell GPUs and newer.
  # https://huggingface.co/nvidia/DeepSeek-R1-FP4
  # You can also specify the full path to locally downloaded weights
  # instead of a HuggingFace ID here.
  model-path: "nvidia/Llama-4-Maverick-17B-128E-Eagle3"
  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
  # Fields in `extra-engine-args` take precedence over the TRTLLM engine fields above.
  extra-engine-args: "configs/llama4/eagle/engine_configs/decode_config.yaml"
  router: round-robin
  enable-disagg: true
  ServiceArgs:
    workers: 1
    resources:
      gpu: 4

TensorRTLLMPrefillWorker:
  # NOTE: FP4 is only supported on Blackwell GPUs and newer.
  # https://huggingface.co/nvidia/DeepSeek-R1-FP4
  # You can also specify the full path to locally downloaded weights
  # instead of a HuggingFace ID here.
  model-path: "nvidia/Llama-4-Maverick-17B-128E-Eagle3"
  # Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.
  # Fields in `extra-engine-args` take precedence over the TRTLLM engine fields above.
  extra-engine-args: "configs/llama4/eagle/engine_configs/prefill_config.yaml"
  router: round-robin
  ServiceArgs:
    workers: 1
    resources:
      gpu: 4
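# Note: the decode worker and the prefill worker each reserve 4 GPUs here,
# matching tensor_parallel_size: 4 in their engine configs, for 8 GPUs total.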