@@ -138,7 +138,8 @@ YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
 cuda_graph_config: {}
-moe_backend: TRTLLM
+moe_config:
+  backend: TRTLLM
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 3
@@ -196,7 +197,7 @@ We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 896
   - 512
@@ -263,7 +264,7 @@ YOUR_DATA_PATH=./dataset.txt
 
 cat >./extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 1
   - 2
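Taken together, the hunks above nest the MoE backend selection under a new `moe_config` block and rename `padding_enabled` to `enable_padding` inside `cuda_graph_config`. A minimal sketch of a complete new-style extra options file that combines both changes (values are illustrative, taken from the hunks above):

```bash
# Sketch of a new-style extra options file; batch sizes and MTP settings are illustrative.
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 2
moe_config:
  backend: TRTLLM
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3
EOF
```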
@@ -124,7 +124,8 @@ YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
 cuda_graph_config: {}
-moe_backend: TRTLLM
+moe_config:
+  backend: TRTLLM
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 3
@@ -179,7 +180,8 @@ YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
 cuda_graph_config: {}
-moe_backend: TRTLLM
+moe_config:
+  backend: TRTLLM
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 3
@@ -157,7 +157,7 @@ These optimizations target the overall execution flow, scheduling, and resource
 
 There is a feature called CUDA Graph padding in TensorRT-LLM, which is a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it tries to pad a batch to the nearest one with a captured CUDA Graph. Normally you should enable the CUDA Graph padding feature to increase the CUDA Graph hit rate, but the padding itself has some overhead due to wasted tokens computation.
 
-Users can opt-out the CUDA Graph padding feature to see the perf benefits, by setting the `cuda_graph_config:\n padding_enabled: False`, see API here [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
+Users can opt-out the CUDA Graph padding feature to see the perf benefits, by setting the `cuda_graph_config:\n enable_padding: False`, see API here [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
 
 * Overlap Scheduler:
 
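The paragraph above describes opting out of CUDA Graph padding when the padding overhead outweighs the higher graph hit rate. A minimal sketch of an extra options file that disables padding under the renamed key (the file name and batch sizes are illustrative):

```bash
# Sketch: disable CUDA Graph padding via the renamed key; batch sizes are illustrative.
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
  enable_padding: false
  batch_sizes:
  - 1
  - 2
  - 4
EOF
```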
@@ -623,7 +623,8 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 cuda_graph_config: {}
-moe_load_balancer: ./moe_load_balancer.yaml
+moe_config:
+  load_balancer: ./moe_load_balancer.yaml
 EOF
 
 trtllm-llmapi-launch \
2 changes: 1 addition & 1 deletion docs/source/performance/perf-overview.md
@@ -201,7 +201,7 @@ trtllm-bench --model $model_name throughput --dataset $dataset_file --backend py
 `llm_options.yml`
 ```yaml
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 1
   - 2
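The hunk above edits the `llm_options.yml` that accompanies the `trtllm-bench ... throughput` command shown in its header. A sketch of how such a file is typically passed to the benchmark; this assumes the `--extra_llm_api_options` flag, so verify the exact option name with `trtllm-bench throughput --help`:

```bash
# Sketch: pass the edited options file to the benchmark command from the hunk header.
# Assumes the --extra_llm_api_options flag exists; verify with `trtllm-bench throughput --help`.
trtllm-bench --model $model_name throughput \
  --dataset $dataset_file \
  --extra_llm_api_options llm_options.yml
```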
6 changes: 4 additions & 2 deletions docs/source/scripts/disaggregated/gen_yaml.py
@@ -190,12 +190,14 @@ def gen_config_file(config_path: str,
         'max_seq_len': 8576,
         'free_gpu_memory_fraction': gen_gpu_memory_fraction,
         'cuda_graph_config': {
-            'padding_enabled': True,
+            'enable_padding': True,
             'batch_sizes': gen_cuda_graph_batch_sizes,
         },
         'print_iter_log': True,
         'kv_cache_dtype': 'fp8',
-        'moe_backend': 'TRTLLM',
+        'moe_config': {
+            'backend': 'TRTLLM',
+        },
         'cache_transceiver_config': {
             'max_num_tokens': 8320,
         },
2 changes: 1 addition & 1 deletion examples/llm-api/llm_runtime.py
@@ -21,7 +21,7 @@ def example_cuda_graph_config():
 
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1, 2, 4],
-        padding_enabled=True,
+        enable_padding=True,
     )
 
     llm = LLM(
6 changes: 3 additions & 3 deletions examples/llm-api/quickstart_advanced.py
@@ -2,7 +2,7 @@
 
 from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm.llmapi import (CudaGraphConfig, DraftTargetDecodingConfig,
-                                 EagleDecodingConfig, KvCacheConfig,
+                                 EagleDecodingConfig, KvCacheConfig, MoeConfig,
                                  MTPDecodingConfig, NGramDecodingConfig,
                                  TorchCompileConfig)
 
@@ -188,7 +188,7 @@ def setup_llm(args):
 
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=args.cuda_graph_batch_sizes,
-        padding_enabled=args.cuda_graph_padding_enabled,
+        enable_padding=args.cuda_graph_padding_enabled,
     ) if args.use_cuda_graph else None
     llm = LLM(
         model=args.model_dir,
@@ -207,7 +207,7 @@ def setup_llm(args):
             enable_piecewise_cuda_graph= \
                 args.use_piecewise_cuda_graph)
         if args.use_torch_compile else None,
-        moe_backend=args.moe_backend,
+        moe_config=MoeConfig(backend=args.moe_backend),
         enable_trtllm_sampler=args.enable_trtllm_sampler,
         max_seq_len=args.max_seq_len,
         max_batch_size=args.max_batch_size,
13 changes: 7 additions & 6 deletions examples/models/core/deepseek_v3/README.md
@@ -142,7 +142,7 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 
 cat <<EOF > /tmp/extra-llm-api-config.yml
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes: [1, 4, 8, 12]
 EOF
 
@@ -169,9 +169,10 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 
 cat <<EOF > /tmp/extra-llm-api-config.yml
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes: [1, 2]
-moe_max_num_tokens: 16384
+moe_config:
+  max_num_tokens: 16384
 EOF
 
 trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \
@@ -237,7 +238,7 @@ To serve the model using `trtllm-serve`:
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 1
   - 2
@@ -316,7 +317,7 @@ export TRTLLM_USE_UCX_KVCACHE=1
 
 cat >./gen-extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 1
   - 2
@@ -539,7 +540,7 @@ python3 /path/to/TensorRT-LLM/benchmarks/cpp/prepare_dataset.py \
 
 cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 1
   - 2
4 changes: 2 additions & 2 deletions examples/models/core/llama/README.md
@@ -1557,14 +1557,14 @@ cat >./extra-llm-api-config.yml <<EOF
 stream_interval: 2
 cuda_graph_config:
   max_batch_size: 1024
-  padding_enabled: true
+  enable_padding: true
 EOF
 ```
 Explanation:
 - `stream_interval`: The iteration interval to create responses under the streaming mode.
 - `cuda_graph_config`: CUDA Graph config.
 - `max_batch_size`: Max CUDA graph batch size to capture.
-- `padding_enabled`: Whether to enable CUDA graph padding.
+- `enable_padding`: Whether to enable CUDA graph padding.
 
 
 ### Launch trtllm-serve OpenAI-compatible API server
8 changes: 4 additions & 4 deletions examples/models/core/llama4/README.md
@@ -29,15 +29,15 @@ enable_attention_dp: true
 stream_interval: 2
 cuda_graph_config:
   max_batch_size: 512
-  padding_enabled: true
+  enable_padding: true
 EOF
 ```
 Explanation:
 - `enable_attention_dp`: Enable attention Data Parallel which is recommend to enable in high concurrency.
 - `stream_interval`: The iteration interval to create responses under the streaming mode.
 - `cuda_graph_config`: CUDA Graph config.
 - `max_batch_size`: Max CUDA graph batch size to capture.
-- `padding_enabled`: Whether to enable CUDA graph padding.
+- `enable_padding`: Whether to enable CUDA graph padding.
 
 
 #### 2. Launch trtllm-serve OpenAI-compatible API server
@@ -81,7 +81,7 @@ enable_min_latency: true
 stream_interval: 2
 cuda_graph_config:
   max_batch_size: 8
-  padding_enabled: true
+  enable_padding: true
 EOF
 ```
 Explanation:
@@ -90,7 +90,7 @@ Explanation:
 - `stream_interval`: The iteration interval to create responses under the streaming mode.
 - `cuda_graph_config`: CUDA Graph config.
 - `max_batch_size`: Max CUDA graph batch size to capture.
-- `padding_enabled`: Whether to enable CUDA graph padding.
+- `enable_padding`: Whether to enable CUDA graph padding.
 
 
 #### 2. Launch trtllm-serve OpenAI-compatible API server
4 changes: 2 additions & 2 deletions examples/models/core/qwen/README.md
@@ -745,7 +745,7 @@ To serve the model using `trtllm-serve`:
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 1
   - 2
@@ -821,7 +821,7 @@ export TRTLLM_USE_UCX_KVCACHE=1
 
 cat >./gen-extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 1
   - 2
26 changes: 15 additions & 11 deletions examples/wide_ep/ep_load_balancer/README.md
@@ -28,7 +28,9 @@ cat > ./extra_llm_api_options.yaml <<EOF
 enable_attention_dp: true
 cuda_graph_config: {}
-moe_backend: WideEP
-moe_max_num_tokens: 8192
+moe_config:
+  backend: WideEP
+  max_num_tokens: 8192
 EOF
 
 trtllm-llmapi-launch \
@@ -117,9 +119,10 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 cuda_graph_config: {}
-moe_backend: WideEP
-moe_max_num_tokens: 9216
-moe_load_balancer: ./moe_load_balancer.yaml
+moe_config:
+  backend: WideEP
+  max_num_tokens: 9216
+  load_balancer: ./moe_load_balancer.yaml
 EOF
 
 trtllm-llmapi-launch \
@@ -183,9 +186,10 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 cuda_graph_config: {}
-moe_backend: WideEP
-moe_max_num_tokens: 9216
-moe_load_balancer: ./moe_load_balancer.yaml
+moe_config:
+  backend: WideEP
+  max_num_tokens: 9216
+  load_balancer: ./moe_load_balancer.yaml
 EOF
 
 trtllm-llmapi-launch \
@@ -204,9 +208,9 @@ trtllm-bench --model ${MODEL_NAME} \
 
 > **Note:** Similar to offline EP Load Balancer, you can enable expert ID counting to verify the effectiveness of EPLB, but remember to disable it when running inference for benchmarking or production purposes.
 
-> **Explanation on moe_max_num_tokens:** For Large Scale EP, there can be extreme conditions that all ranks send tokens to a single rank since they all want that expert.
+> **Explanation on max_num_tokens of moe_config:** For Large Scale EP, there can be extreme conditions that all ranks send tokens to a single rank since they all want that expert.
 In that case, that rank will have too many tokens to compute. In order not to make the hot rank OOM, there is one strategy that chunk the tokens if there are too much.
-`moe_max_num_tokens` is the parameter that controls the max chunk size. However, this may have performance penalty if there is enough since batch size is smaller.
+`max_num_tokens` of moe_config is the parameter that controls the max chunk size. However, this may have performance penalty if there is enough since batch size is smaller.
 So by default, it is set to some value that all tokens can complete in one wave. However, if EP size is large, we may need to trade off that in order not to OOM or got other runtime errors due to lack of memory.
-One good point is that if memory is OK, we can set `moe_max_num_tokens` to `max_batch_size * ep_size` to make all generation requests can be processed in one chunk.
-For example, if `ep_size` is 36 and `max_batch_size` is 256, we may set `moe_max_num_tokens` to 9216.
+One good point is that if memory is OK, we can set `max_num_tokens` to `max_batch_size * ep_size` to make all generation requests can be processed in one chunk.
+For example, if `ep_size` is 36 and `max_batch_size` is 256, we may set `max_num_tokens` to 9216.
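Following the sizing rule above (`max_num_tokens = max_batch_size * ep_size`, here 256 * 36 = 9216), a sketch of writing the corresponding options file in the new nested form; the EP size, batch size, and EPLB YAML path follow the example values in the hunks above and should be adjusted to your deployment:

```bash
# Sketch: derive moe_config.max_num_tokens from the sizing rule in the note above.
# EP size and max batch size are the example values from the note; adjust to your setup.
EP_SIZE=36
MAX_BATCH_SIZE=256
cat > ./extra_llm_api_options_eplb.yaml <<EOF
enable_attention_dp: true
cuda_graph_config: {}
moe_config:
  backend: WideEP
  max_num_tokens: $((MAX_BATCH_SIZE * EP_SIZE))  # 256 * 36 = 9216, one chunk per generation wave
  load_balancer: ./moe_load_balancer.yaml
EOF
```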
2 changes: 1 addition & 1 deletion examples/wide_ep/slurm_scripts/gen_yaml.py
@@ -196,7 +196,7 @@ def gen_config_file(config_path: str,
         'max_seq_len': 2176,
         'free_gpu_memory_fraction': gen_gpu_memory_fraction,
         'cuda_graph_config': {
-            'padding_enabled': True,
+            'enable_padding': True,
             'batch_sizes': gen_cuda_graph_batch_sizes,
         },
         'print_iter_log': True,
6 changes: 3 additions & 3 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -309,7 +309,7 @@ def get_rank_model_storage(model):
 def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
                                    max_batch_size: int, max_num_tokens: int,
                                    max_draft_len: int,
-                                   padding_enabled: bool) -> list[int]:
+                                   enable_padding: bool) -> list[int]:
     # This is the largest possible batch size for a pure decoding batch.
     max_cuda_graph_bs = min(max_batch_size,
                             int(max_num_tokens / (1 + max_draft_len)))
@@ -326,8 +326,8 @@ def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
         # is that if the user is OK padding to a batch size B, they should also
         # be OK with padding to some size B' < B since the performance will generally
         # just be better in the smaller case.
-        if padding_enabled and (i == 0
-                                or result[i - 1] != max_cuda_graph_bs):
+        if enable_padding and (i == 0
+                               or result[i - 1] != max_cuda_graph_bs):
             logger.warning(
                 "CUDA graph padding is enabled, but one of the given CUDA graph "
                 f"batch sizes ({bs}) is larger than the executor's max batch size "
2 changes: 1 addition & 1 deletion tensorrt_llm/bench/benchmark/utils/general.py
@@ -152,7 +152,7 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
         pass
 
     cuda_graph_config = {
-        "padding_enabled": True,
+        "enable_padding": True,
         "max_batch_size": max_batch_size
     }
 
3 changes: 2 additions & 1 deletion tensorrt_llm/llmapi/__init__.py
@@ -9,7 +9,7 @@
                         CudaGraphConfig, DraftTargetDecodingConfig,
                         DynamicBatchConfig, EagleDecodingConfig,
                         ExtendedRuntimePerfKnobConfig, KvCacheConfig, LlmArgs,
-                        LookaheadDecodingConfig, MedusaDecodingConfig,
+                        LookaheadDecodingConfig, MedusaDecodingConfig, MoeConfig,
                         MTPDecodingConfig, NGramDecodingConfig, SchedulerConfig,
                         TorchCompileConfig, TorchLlmArgs, TrtLlmArgs,
                         UserProvidedDecodingConfig)
@@ -27,6 +27,7 @@
     'KvCacheConfig',
     'KvCacheRetentionConfig',
     'CudaGraphConfig',
+    'MoeConfig',
     'LookaheadDecodingConfig',
     'MedusaDecodingConfig',
     'EagleDecodingConfig',