diff --git a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml new file mode 100644 index 0000000000..56ccf8d07d --- /dev/null +++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +backend: pytorch +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +max_batch_size: 8 +max_num_tokens: 4096 +disable_overlap_scheduler: true # disable_overlap_scheduler has an accuracy issue on both aggregated and disaggregated serving + +# Enable Speculative Decoding in the model engine +speculative_config: + decoding_type: Eagle + max_draft_len: 3 + pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: true + +kv_cache_config: + free_gpu_memory_fraction: 0.5 + enable_block_reuse: false # set to true when target and draft use the same KV cache dtype + +cuda_graph_config: + padding_enabled: true + max_batch_size: 8 + +print_iter_log: true diff --git a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml new file mode 100644 index 0000000000..556a1365f5 --- /dev/null +++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +backend: pytorch +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +max_batch_size: 256 +max_num_tokens: 1024 +# 8704 = 8192 ISL + 512 OSL +max_seq_len: 8704 +disable_overlap_scheduler: true + +# Enable Speculative Decoding in the model engine +speculative_config: + decoding_type: Eagle + max_draft_len: 3 + pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: true + +kv_cache_config: + free_gpu_memory_fraction: 0.5 + enable_block_reuse: false + +cuda_graph_config: + padding_enabled: true + max_batch_size: 256 + +print_iter_log: true diff --git a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml new file mode 100644 index 0000000000..a75d2a6219 --- /dev/null +++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +backend: pytorch +tensor_parallel_size: 8 +moe_expert_parallel_size: 8 +max_batch_size: 1 +max_num_tokens: 8192 +max_seq_len: 8192 +print_iter_log: true +disable_overlap_scheduler: true + +# Enable Speculative Decoding in the model engine +speculative_config: + decoding_type: Eagle + max_draft_len: 3 + pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 + eagle3_one_model: true + +kv_cache_config: + free_gpu_memory_fraction: 0.5 + enable_block_reuse: false diff --git a/components/backends/trtllm/llama4_plus_eagle.md b/components/backends/trtllm/llama4_plus_eagle.md index 92d9704997..2d542f7a1a 100644 --- a/components/backends/trtllm/llama4_plus_eagle.md +++ b/components/backends/trtllm/llama4_plus_eagle.md @@ -34,6 +34,12 @@ For advanced control over how requests are routed between prefill and decode wor * Built with a version of TensorRT-LLM based on the 0.21 release [Link](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.21) * If you need to download model weights off huggingface, make sure you run the command `huggingface-cli login` and have access to the necessary gated models. +## Eagle3-one-model +* An Eagle3-one-model (`eagle3_one_model: true`) config is added in `engine_configs/llama4/eagle_one_model`. Build dynamo with the latest commit `66f299a` in TRTLLM 1.0.0rc2 [Link](https://github.com/NVIDIA/TensorRT-LLM/commits/v1.0.0rc2/). +* The configs in `engine_configs/llama4/eagle_one_model` are tested on an 8xH100 cluster. Be sure to change `NUM_GPUS_PER_NODE` accordingly or change the TP/EP size in the configs: one 8xH100 node for the aggregated `.yml` file, two 8xH100 nodes for the prefill/decode `.yaml` files. +* The current `./multinode/start_frontend_services.sh` may get run `NUM_GPUS_PER_NODE` times depending on how srun/mpi is launched; be aware that the frontend service only needs to be run once. +* Eagle3-one-model appends the eagle3 layer at the end of the TRTLLM engine, instead of sending base/draft requests between 2 engines. See the TensorRT-LLM documentation for more information. 
+ ## Setup @@ -77,3 +83,17 @@ export DECODE_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_decode.yaml" ## Example Request See [here](./multinode/multinode-examples.md#example-request) to learn how to send a request to the deployment. + +``` +curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8", + "messages": [{"role": "user", "content": "Why is NVIDIA a great company?"}], + "max_tokens": 1024 + }' -w "\n" + + +# output: +{"id":"cmpl-3e87ea5c-010e-4dd2-bcc4-3298ebd845a8","choices":[{"text":"NVIDIA is considered a great company for several reasons:\n\n1. **Technological Innovation**: NVIDIA is a leader in the field of graphics processing units (GPUs) and has been at the forefront of technological innovation. +... +and the broader tech industry.\n\nThese factors combined have contributed to NVIDIA's status as a great company in the technology sector.","index":0,"logprobs":null,"finish_reason":"stop"}],"created":1753329671,"model":"nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8","system_fingerprint":null,"object":"text_completion","usage":{"prompt_tokens":16,"completion_tokens":562,"total_tokens":578,"prompt_tokens_details":null,"completion_tokens_details":null}} +```