From b09f596d36a845f8fef5055440d8f77afa2e2e45 Mon Sep 17 00:00:00 2001
From: izhuhaoran <izhuhaoran@qq.com>
Date: Wed, 25 Jun 2025 14:14:36 +0800
Subject: [PATCH 1/5] shrink default cuda_graph_sizes to [max_num_seqs] for v1
 engine

Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
---
 vllm/config.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 4333dcd3b8af..556c9ffe5896 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2033,11 +2033,12 @@ class SchedulerConfig:
     NOTE: This will be replaced by speculative config in the future; it is
     present to enable correctness tests until then."""
 
-    cuda_graph_sizes: list[int] = field(default_factory=lambda: [512])
-    """Cuda graph capture sizes, default is 512.
-    1. if one value is provided, then the capture list would follow the
+    cuda_graph_sizes: list[int] = None  # type: ignore
+    """Cuda graph capture sizes
+    1. if none provided, then default set to [max_num_seqs]
+    2. if one value is provided, then the capture list would follow the
     pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
-    2. more than one value (e.g. 1 2 128) is provided, then the capture list
+    3. more than one value (e.g. 1 2 128) is provided, then the capture list
     will follow the provided list."""
 
     delay_factor: float = 0.0
@@ -2202,6 +2203,9 @@ def __post_init__(self) -> None:
                 self.max_num_partial_prefills, self.max_long_partial_prefills,
                 self.long_prefill_token_threshold)
 
+        if self.cuda_graph_sizes is None:
+            self.cuda_graph_sizes = [self.max_num_seqs]
+
     @model_validator(mode='after')
     def _verify_args(self) -> Self:
         if (self.max_num_batched_tokens < self.max_model_len

From b62185a217370afc89af9190ffaae4a0abf9ea4e Mon Sep 17 00:00:00 2001
From: izhuhaoran <izhuhaoran@qq.com>
Date: Wed, 25 Jun 2025 15:12:42 +0800
Subject: [PATCH 2/5] add comment explaining the cuda_graph_sizes default

Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
---
 vllm/config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/config.py b/vllm/config.py
index 556c9ffe5896..6851ef0803af 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2203,6 +2203,7 @@ def __post_init__(self) -> None:
                 self.max_num_partial_prefills, self.max_long_partial_prefills,
                 self.long_prefill_token_threshold)
 
+        # If cuda_graph_sizes is not specified, default set to [max_num_seqs].
         if self.cuda_graph_sizes is None:
             self.cuda_graph_sizes = [self.max_num_seqs]
 

From 3e0089c78367b881e3d8349588795691c4955665 Mon Sep 17 00:00:00 2001
From: izhuhaoran <izhuhaoran@qq.com>
Date: Wed, 25 Jun 2025 16:00:54 +0800
Subject: [PATCH 3/5] fix lint error

Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
---
 vllm/config.py           | 2 +-
 vllm/engine/arg_utils.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 6851ef0803af..08fd1d73522a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2033,7 +2033,7 @@ class SchedulerConfig:
     NOTE: This will be replaced by speculative config in the future; it is
     present to enable correctness tests until then."""
 
-    cuda_graph_sizes: list[int] = None  # type: ignore
+    cuda_graph_sizes: Optional[list[int]] = None  # type: ignore
     """Cuda graph capture sizes
     1. if none provided, then default set to [max_num_seqs]
     2. if one value is provided, then the capture list would follow the
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 9d1008b6b350..cc4b4900f138 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -303,8 +303,8 @@ class EngineArgs:
     kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
     seed: Optional[int] = ModelConfig.seed
     max_model_len: Optional[int] = ModelConfig.max_model_len
-    cuda_graph_sizes: list[int] = get_field(SchedulerConfig,
-                                            "cuda_graph_sizes")
+    cuda_graph_sizes: Optional[list[int]] = get_field(SchedulerConfig,
+                                                      "cuda_graph_sizes")
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
     # notice.

From 774f789995f0b1ff232af7bf5637e44a016a2a39 Mon Sep 17 00:00:00 2001
From: izhuhaoran <izhuhaoran@qq.com>
Date: Wed, 25 Jun 2025 16:21:45 +0800
Subject: [PATCH 4/5] fix ci lint error

Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
---
 vllm/config.py           | 4 ++--
 vllm/engine/arg_utils.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 08fd1d73522a..8bfd45757402 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2033,7 +2033,7 @@ class SchedulerConfig:
     NOTE: This will be replaced by speculative config in the future; it is
     present to enable correctness tests until then."""
 
-    cuda_graph_sizes: Optional[list[int]] = None  # type: ignore
+    cuda_graph_sizes: list[int] = field(default_factory=lambda: [])
     """Cuda graph capture sizes
     1. if none provided, then default set to [max_num_seqs]
     2. if one value is provided, then the capture list would follow the
@@ -2204,7 +2204,7 @@ def __post_init__(self) -> None:
                 self.long_prefill_token_threshold)
 
         # If cuda_graph_sizes is not specified, default set to [max_num_seqs].
-        if self.cuda_graph_sizes is None:
+        if not self.cuda_graph_sizes:
             self.cuda_graph_sizes = [self.max_num_seqs]
 
     @model_validator(mode='after')
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index cc4b4900f138..9d1008b6b350 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -303,8 +303,8 @@ class EngineArgs:
     kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
     seed: Optional[int] = ModelConfig.seed
     max_model_len: Optional[int] = ModelConfig.max_model_len
-    cuda_graph_sizes: Optional[list[int]] = get_field(SchedulerConfig,
-                                                      "cuda_graph_sizes")
+    cuda_graph_sizes: list[int] = get_field(SchedulerConfig,
+                                            "cuda_graph_sizes")
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
     # notice.

From 8118ceb4fae3e24b587397990599a3c8e3af86d0 Mon Sep 17 00:00:00 2001
From: Aaron Pham <Aaronpham0103@gmail.com>
Date: Wed, 25 Jun 2025 12:53:20 -0400
Subject: [PATCH 5/5] chore: use list as the default_factory

---
 vllm/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index 8bfd45757402..0c20d31558d6 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2033,7 +2033,7 @@ class SchedulerConfig:
     NOTE: This will be replaced by speculative config in the future; it is
     present to enable correctness tests until then."""
 
-    cuda_graph_sizes: list[int] = field(default_factory=lambda: [])
+    cuda_graph_sizes: list[int] = field(default_factory=list)
     """Cuda graph capture sizes
     1. if none provided, then default set to [max_num_seqs]
     2. if one value is provided, then the capture list would follow the