diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
index 926d407cb62..1d07ce70b29 100644
--- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
+++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
@@ -195,20 +195,20 @@ We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers
 #### Benchmark

 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 cat >./extra-llm-api-config.yml <<EOF
 cat <<EOF > /tmp/extra-llm-api-config.yml
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes: [1, 4, 8, 12]
 EOF
@@ -169,7 +169,7 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 cat <<EOF > /tmp/extra-llm-api-config.yml
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes: [1, 2]
 moe_max_num_tokens: 16384
 EOF
@@ -237,7 +237,7 @@ To serve the model using `trtllm-serve`:

 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 cat >./gen-extra-llm-api-config.yml <<EOF
 cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <<EOF
 cat >./extra-llm-api-config.yml <<EOF
 cat >./gen-extra-llm-api-config.yml <<EOF
-                                   padding_enabled: bool) -> list[int]:
+                                   enable_padding: bool) -> list[int]:
     # This is the largest possible batch size for a pure decoding batch.
     max_cuda_graph_bs = min(max_batch_size,
                             int(max_num_tokens / (1 + max_draft_len)))
@@ -326,8 +326,8 @@ def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
         # is that if the user is OK padding to a batch size B, they should also
         # be OK with padding to some size B' < B since the performance will generally
         # just be better in the smaller case.
-        if padding_enabled and (i == 0
-                                or result[i - 1] != max_cuda_graph_bs):
+        if enable_padding and (i == 0
+                               or result[i - 1] != max_cuda_graph_bs):
             logger.warning(
                 "CUDA graph padding is enabled, but one of the given CUDA graph "
                 f"batch sizes ({bs}) is larger than the executor's max batch size "
diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py
index 1447038251f..538192b88f4 100755
--- a/tensorrt_llm/bench/benchmark/utils/general.py
+++ b/tensorrt_llm/bench/benchmark/utils/general.py
@@ -151,7 +151,7 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
         pass

     cuda_graph_config = {
-        "padding_enabled": True,
+        "enable_padding": True,
         "max_batch_size": max_batch_size
     }
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 0cfd6e95eaf..f7e66574192 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -71,7 +71,7 @@ class CudaGraphConfig(BaseModel):
     max_batch_size: int = Field(
         default=0, description="Maximum batch size for CUDA graphs.")
-    padding_enabled: bool = Field(
+    enable_padding: bool = Field(
         default=False,
         description=
         "If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance."
@@ -1831,17 +1831,17 @@ def validate_stream_interval(self):

     @staticmethod
     def _generate_cuda_graph_batch_sizes(max_batch_size: int,
-                                         padding_enabled: bool) -> List[int]:
+                                         enable_padding: bool) -> List[int]:
         """Generate a list of batch sizes for CUDA graphs.

         Args:
             max_batch_size: Maximum batch size to generate up to
-            padding_enabled: Whether padding is enabled, which affects the batch size distribution
+            enable_padding: Whether padding is enabled, which affects the batch size distribution

         Returns:
             List of batch sizes to create CUDA graphs for
         """
-        if padding_enabled:
+        if enable_padding:
             batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         else:
             batch_sizes = list(range(1, 32)) + [32, 64, 128]
@@ -1879,7 +1879,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
             config.batch_sizes = sorted(config.batch_sizes)
             if config.max_batch_size != 0:
                 if config.batch_sizes != self._generate_cuda_graph_batch_sizes(
-                        config.max_batch_size, config.padding_enabled):
+                        config.max_batch_size, config.enable_padding):
                     raise ValueError(
                         "Please don't set both cuda_graph_config.batch_sizes "
                         "and cuda_graph_config.max_batch_size.\n"
@@ -1891,7 +1891,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
         else:
             max_batch_size = config.max_batch_size or 128
             generated_sizes = self._generate_cuda_graph_batch_sizes(
-                max_batch_size, config.padding_enabled)
+                max_batch_size, config.enable_padding)
             config.batch_sizes = generated_sizes
             config.max_batch_size = max_batch_size
@@ -1910,9 +1910,9 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
             cuda_graph_max_batch_size=self.cuda_graph_config.max_batch_size
             if self.cuda_graph_config else
             CudaGraphConfig.model_fields['max_batch_size'].default,
-            cuda_graph_padding_enabled=self.cuda_graph_config.padding_enabled
+            cuda_graph_padding_enabled=self.cuda_graph_config.enable_padding
             if self.cuda_graph_config else
-            CudaGraphConfig.model_fields['padding_enabled'].default,
+            CudaGraphConfig.model_fields['enable_padding'].default,
             disable_overlap_scheduler=self.disable_overlap_scheduler,
             moe_max_num_tokens=self.moe_max_num_tokens,
             moe_load_balancer=self.moe_load_balancer,
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 0db89d5d5b5..fcdf19f1808 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -102,7 +102,7 @@ def test_bfloat16(self, attn_backend, torch_compile):
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -129,7 +129,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -154,7 +154,7 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -193,7 +193,7 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -741,7 +741,7 @@ def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
             disable_overlap_scheduler=False,
             cuda_graph_config=CudaGraphConfig(
                 max_batch_size=512,
-                padding_enabled=True,
+                enable_padding=True,
             ),
         )
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
@@ -765,7 +765,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
         mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         pytorch_config = dict(
             disable_overlap_scheduler=False,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=True),
+            cuda_graph_config=CudaGraphConfig(enable_padding=True),
         )
         quant_config = QuantConfig()
         quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
@@ -1857,7 +1857,7 @@ class TestKanana_Instruct(LlmapiAccuracyTestHarness):
     def test_auto_dtype(self):
         "RCCA: https://nvbugspro.nvidia.com/bug/5310520"
         pytorch_config = dict(cuda_graph_config=CudaGraphConfig(
-            padding_enabled=True, max_batch_size=384))
+            enable_padding=True, max_batch_size=384))
         with LLM(self.MODEL_PATH, **pytorch_config,
                  enable_attention_dp=True) as llm:
             task = MMLU(self.MODEL_NAME)
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
index 6135aefa0a7..1171fb4f102 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
@@ -17,7 +17,7 @@ generation_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: true
   cuda_graph_config:
-    padding_enabled: False
+    enable_padding: False
   disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
index e4880434eb0..18acc70f9ac 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
@@ -15,7 +15,7 @@ generation_servers:
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   cuda_graph_config:
-    padding_enabled: False
+    enable_padding: False
   disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
index 8f1ff654b38..7009df9fd0f 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
@@ -28,7 +28,7 @@ generation_servers:
       free_gpu_memory_fraction: 0.2
       enable_partial_reuse: False
   cuda_graph_config:
-    padding_enabled: True
+    enable_padding: True
     batch_sizes: [1,4,8,16,24,32]
   disable_overlap_scheduler: True
   urls:
diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py
index cc5e5b799f1..c2b6c6e92c3 100644
--- a/tests/integration/defs/perf/pytorch_model_config.py
+++ b/tests/integration/defs/perf/pytorch_model_config.py
@@ -30,7 +30,7 @@ def get_model_yaml_config(model_label: str,
     base_config = {
         'print_iter_log': True,
         'cuda_graph_config': {
-            'padding_enabled': True,
+            'enable_padding': True,
         },
     }
     if 'kv_cache_dtype' in model_label:
@@ -65,9 +65,10 @@
             ],
             'config': {
                 'enable_attention_dp': True,
-                'cuda_graph_padding_enabled': True,
-                'cuda_graph_batch_sizes':
-                [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
+                'cuda_graph_config': {
+                    'enable_padding': True,
+                    'batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
+                }
             }
         },
         # DeepSeek R1 model with specific batch size 128
@@ -86,7 +87,7 @@
             'config': {
                 'print_iter_log': True,
                 'cuda_graph_config': {
-                    'padding_enabled': True,
+                    'enable_padding': True,
                     'batch_sizes': [1, 512, 1024, 2048]
                 }
             }
diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index bfa6abd0177..f0f85fe51e3 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -519,7 +519,7 @@ def stress_test(config,
     if config.backend == "pytorch":
         extra_llm_options.update({
             "cuda_graph_config": {
-                "padding_enabled": True,
+                "enable_padding": True,
                 "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
             },
             "print_iter_log": True,
diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py
index b2eb9e8d8cd..0c2aaf20a13 100644
--- a/tests/unittest/llmapi/test_llm_args.py
+++ b/tests/unittest/llmapi/test_llm_args.py
@@ -272,7 +272,7 @@ def test_cuda_graph_batch_sizes_case_0_1(self):
             cuda_graph_config=CudaGraphConfig(
                 batch_sizes=CudaGraphConfig._generate_cuda_graph_batch_sizes(
                     128, True),
-                padding_enabled=True,
+                enable_padding=True,
                 max_batch_size=128))
         assert args.cuda_graph_config.batch_sizes == CudaGraphConfig._generate_cuda_graph_batch_sizes(
             128, True)
@@ -282,14 +282,14 @@ def test_cuda_graph_batch_sizes_case_1(self):
         # set cuda_graph_batch_sizes only
         args = TorchLlmArgs(model=llama_model_path,
                             cuda_graph_config=CudaGraphConfig(
-                                batch_sizes=[1, 2, 4], padding_enabled=True))
+                                batch_sizes=[1, 2, 4], enable_padding=True))
         assert args.cuda_graph_config.batch_sizes == [1, 2, 4]

     def test_cuda_graph_batch_sizes_case_2(self):
         # set cuda_graph_config.max_batch_size only
         args = TorchLlmArgs(model=llama_model_path,
                             cuda_graph_config=CudaGraphConfig(
-                                max_batch_size=128, padding_enabled=True))
+                                max_batch_size=128, enable_padding=True))
         assert args.cuda_graph_config.batch_sizes == CudaGraphConfig._generate_cuda_graph_batch_sizes(
             128, True)
         assert args.cuda_graph_config.max_batch_size == 128
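
Below is a minimal usage sketch of the renamed field, mirroring the updated tests and configs in this diff. It assumes `CudaGraphConfig` is importable from `tensorrt_llm.llmapi`, and the model path is a placeholder.

```python
# Minimal sketch of the renamed option, mirroring the updated tests above.
# Assumes CudaGraphConfig is exported from tensorrt_llm.llmapi; the model
# path below is a placeholder.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig

llm = LLM(
    model="/path/to/model",  # placeholder checkpoint
    cuda_graph_config=CudaGraphConfig(
        enable_padding=True,  # formerly `padding_enabled`
        max_batch_size=128,   # CUDA graph batch sizes are generated up to this value
    ),
)
```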