diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
index 926d407cb62..1d07ce70b29 100644
--- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
+++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
@@ -195,20 +195,20 @@ We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers
 #### Benchmark

 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 cat >./extra-llm-api-config.yml <<EOF
 cat <<EOF > /tmp/extra-llm-api-config.yml
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes: [1, 4, 8, 12]
 EOF
@@ -169,7 +169,7 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 cat <<EOF > /tmp/extra-llm-api-config.yml
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes: [1, 2]
 moe_max_num_tokens: 16384
 EOF
@@ -237,7 +237,7 @@ To serve the model using `trtllm-serve`:

 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 cat >./gen-extra-llm-api-config.yml <<EOF
 cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <<EOF
 cat >./extra-llm-api-config.yml <<EOF
 cat >./gen-extra-llm-api-config.yml <<EOF
-                                   padding_enabled: bool) -> list[int]:
+                                   enable_padding: bool) -> list[int]:
     # This is the largest possible batch size for a pure decoding batch.
     max_cuda_graph_bs = min(max_batch_size,
                             int(max_num_tokens / (1 + max_draft_len)))
@@ -326,8 +326,8 @@ def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
         # is that if the user is OK padding to a batch size B, they should also
         # be OK with padding to some size B' < B since the performance will generally
         # just be better in the smaller case.
-        if padding_enabled and (i == 0
-                                or result[i - 1] != max_cuda_graph_bs):
+        if enable_padding and (i == 0
+                               or result[i - 1] != max_cuda_graph_bs):
             logger.warning(
                 "CUDA graph padding is enabled, but one of the given CUDA graph "
                 f"batch sizes ({bs}) is larger than the executor's max batch size "
diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py
index 1447038251f..538192b88f4 100755
--- a/tensorrt_llm/bench/benchmark/utils/general.py
+++ b/tensorrt_llm/bench/benchmark/utils/general.py
@@ -151,7 +151,7 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
         pass

     cuda_graph_config = {
-        "padding_enabled": True,
+        "enable_padding": True,
         "max_batch_size": max_batch_size
     }
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 0cfd6e95eaf..f7e66574192 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -71,7 +71,7 @@ class CudaGraphConfig(BaseModel):
     max_batch_size: int = Field(
         default=0, description="Maximum batch size for CUDA graphs.")
-    padding_enabled: bool = Field(
+    enable_padding: bool = Field(
         default=False,
         description=
         "If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance."
@@ -1831,17 +1831,17 @@ def validate_stream_interval(self):

     @staticmethod
     def _generate_cuda_graph_batch_sizes(max_batch_size: int,
-                                         padding_enabled: bool) -> List[int]:
+                                         enable_padding: bool) -> List[int]:
         """Generate a list of batch sizes for CUDA graphs.

         Args:
             max_batch_size: Maximum batch size to generate up to
-            padding_enabled: Whether padding is enabled, which affects the batch size distribution
+            enable_padding: Whether padding is enabled, which affects the batch size distribution

         Returns:
             List of batch sizes to create CUDA graphs for
         """
-        if padding_enabled:
+        if enable_padding:
             batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         else:
             batch_sizes = list(range(1, 32)) + [32, 64, 128]
@@ -1879,7 +1879,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
             config.batch_sizes = sorted(config.batch_sizes)
             if config.max_batch_size != 0:
                 if config.batch_sizes != self._generate_cuda_graph_batch_sizes(
-                        config.max_batch_size, config.padding_enabled):
+                        config.max_batch_size, config.enable_padding):
                     raise ValueError(
                         "Please don't set both cuda_graph_config.batch_sizes "
                         "and cuda_graph_config.max_batch_size.\n"
@@ -1891,7 +1891,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
         else:
             max_batch_size = config.max_batch_size or 128
             generated_sizes = self._generate_cuda_graph_batch_sizes(
-                max_batch_size, config.padding_enabled)
+                max_batch_size, config.enable_padding)
             config.batch_sizes = generated_sizes
             config.max_batch_size = max_batch_size
@@ -1910,9 +1910,9 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
             cuda_graph_max_batch_size=self.cuda_graph_config.max_batch_size
             if self.cuda_graph_config else
             CudaGraphConfig.model_fields['max_batch_size'].default,
-            cuda_graph_padding_enabled=self.cuda_graph_config.padding_enabled
+            cuda_graph_padding_enabled=self.cuda_graph_config.enable_padding
             if self.cuda_graph_config else
-            CudaGraphConfig.model_fields['padding_enabled'].default,
+            CudaGraphConfig.model_fields['enable_padding'].default,
             disable_overlap_scheduler=self.disable_overlap_scheduler,
             moe_max_num_tokens=self.moe_max_num_tokens,
             moe_load_balancer=self.moe_load_balancer,
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 0db89d5d5b5..fcdf19f1808 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -102,7 +102,7 @@ def test_bfloat16(self, attn_backend, torch_compile):
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -129,7 +129,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -154,7 +154,7 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -193,7 +193,7 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -741,7 +741,7 @@ def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
             disable_overlap_scheduler=False,
             cuda_graph_config=CudaGraphConfig(
                 max_batch_size=512,
-                padding_enabled=True,
+                enable_padding=True,
             ),
         )
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
@@ -765,7 +765,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
         mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         pytorch_config = dict(
             disable_overlap_scheduler=False,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=True),
+            cuda_graph_config=CudaGraphConfig(enable_padding=True),
         )
         quant_config = QuantConfig()
         quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
@@ -1857,7 +1857,7 @@ class TestKanana_Instruct(LlmapiAccuracyTestHarness):
     def test_auto_dtype(self):
         "RCCA: https://nvbugspro.nvidia.com/bug/5310520"
         pytorch_config = dict(cuda_graph_config=CudaGraphConfig(
-            padding_enabled=True, max_batch_size=384))
+            enable_padding=True, max_batch_size=384))
         with LLM(self.MODEL_PATH, **pytorch_config,
                  enable_attention_dp=True) as llm:
             task = MMLU(self.MODEL_NAME)
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
index 6135aefa0a7..1171fb4f102 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
@@ -17,7 +17,7 @@ generation_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: true
   cuda_graph_config:
-    padding_enabled: False
+    enable_padding: False
   disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
index e4880434eb0..18acc70f9ac 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
@@ -15,7 +15,7 @@ generation_servers:
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   cuda_graph_config:
-    padding_enabled: False
+    enable_padding: False
   disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
index 8f1ff654b38..7009df9fd0f 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
@@ -28,7 +28,7 @@ generation_servers:
       free_gpu_memory_fraction: 0.2
       enable_partial_reuse: False
   cuda_graph_config:
-    padding_enabled: True
+    enable_padding: True
     batch_sizes: [1,4,8,16,24,32]
   disable_overlap_scheduler: True
   urls:
diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py
index cc5e5b799f1..c2b6c6e92c3 100644
--- a/tests/integration/defs/perf/pytorch_model_config.py
+++ b/tests/integration/defs/perf/pytorch_model_config.py
@@ -30,7 +30,7 @@ def get_model_yaml_config(model_label: str,
     base_config = {
         'print_iter_log': True,
         'cuda_graph_config': {
-            'padding_enabled': True,
+            'enable_padding': True,
         },
     }
     if 'kv_cache_dtype' in model_label:
@@ -65,9 +65,10 @@
             ],
             'config': {
                 'enable_attention_dp': True,
-                'cuda_graph_padding_enabled': True,
-                'cuda_graph_batch_sizes':
-                [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
+                'cuda_graph_config': {
+                    'enable_padding': True,
+                    'batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
+                }
             }
         },
         # DeepSeek R1 model with specific batch size 128
@@ -86,7 +87,7 @@
             'config': {
                 'print_iter_log': True,
                 'cuda_graph_config': {
-                    'padding_enabled': True,
+                    'enable_padding': True,
                     'batch_sizes': [1, 512, 1024, 2048]
                 }
             }
diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index bfa6abd0177..f0f85fe51e3 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -519,7 +519,7 @@ def stress_test(config,
     if config.backend == "pytorch":
         extra_llm_options.update({
             "cuda_graph_config": {
-                "padding_enabled": True,
+                "enable_padding": True,
                 "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
             },
             "print_iter_log": True,
diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py
index b2eb9e8d8cd..0c2aaf20a13 100644
--- a/tests/unittest/llmapi/test_llm_args.py
+++ b/tests/unittest/llmapi/test_llm_args.py
@@ -272,7 +272,7 @@ def test_cuda_graph_batch_sizes_case_0_1(self):
             cuda_graph_config=CudaGraphConfig(
                 batch_sizes=CudaGraphConfig._generate_cuda_graph_batch_sizes(
                     128, True),
-                padding_enabled=True,
+                enable_padding=True,
                 max_batch_size=128))
         assert args.cuda_graph_config.batch_sizes == CudaGraphConfig._generate_cuda_graph_batch_sizes(
             128, True)
@@ -282,14 +282,14 @@ def test_cuda_graph_batch_sizes_case_1(self):
         # set cuda_graph_batch_sizes only
         args = TorchLlmArgs(model=llama_model_path,
                             cuda_graph_config=CudaGraphConfig(
-                                batch_sizes=[1, 2, 4], padding_enabled=True))
+                                batch_sizes=[1, 2, 4], enable_padding=True))
         assert args.cuda_graph_config.batch_sizes == [1, 2, 4]

     def test_cuda_graph_batch_sizes_case_2(self):
         # set cuda_graph_config.max_batch_size only
         args = TorchLlmArgs(model=llama_model_path,
                             cuda_graph_config=CudaGraphConfig(
-                                max_batch_size=128, padding_enabled=True))
+                                max_batch_size=128, enable_padding=True))
         assert args.cuda_graph_config.batch_sizes == CudaGraphConfig._generate_cuda_graph_batch_sizes(
             128, True)
         assert args.cuda_graph_config.max_batch_size == 128
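
Below is a minimal usage sketch of the renamed field, mirroring the updated tests and configs in this diff. It assumes `CudaGraphConfig` is importable from `tensorrt_llm.llmapi`, and the model path is a placeholder.

```python
# Minimal sketch of the renamed option, mirroring the updated tests above.
# Assumes CudaGraphConfig is exported from tensorrt_llm.llmapi; the model
# path below is a placeholder.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig

llm = LLM(
    model="/path/to/model",  # placeholder checkpoint
    cuda_graph_config=CudaGraphConfig(
        enable_padding=True,  # formerly `padding_enabled`
        max_batch_size=128,   # CUDA graph batch sizes are generated up to this value
    ),
)
```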