From b09f596d36a845f8fef5055440d8f77afa2e2e45 Mon Sep 17 00:00:00 2001 From: izhuhaoran Date: Wed, 25 Jun 2025 14:14:36 +0800 Subject: [PATCH 1/5] adjust cuda_graph_sizes shrink to max_num_seqs for v1 engine Signed-off-by: izhuhaoran --- vllm/config.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 4333dcd3b8af..556c9ffe5896 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2033,11 +2033,12 @@ class SchedulerConfig: NOTE: This will be replaced by speculative config in the future; it is present to enable correctness tests until then.""" - cuda_graph_sizes: list[int] = field(default_factory=lambda: [512]) - """Cuda graph capture sizes, default is 512. - 1. if one value is provided, then the capture list would follow the + cuda_graph_sizes: list[int] = None # type: ignore + """Cuda graph capture sizes + 1. if none provided, then default set to [max_num_seqs] + 2. if one value is provided, then the capture list would follow the pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)] - 2. more than one value (e.g. 1 2 128) is provided, then the capture list + 3. more than one value (e.g. 1 2 128) is provided, then the capture list will follow the provided list.""" delay_factor: float = 0.0 @@ -2202,6 +2203,9 @@ def __post_init__(self) -> None: self.max_num_partial_prefills, self.max_long_partial_prefills, self.long_prefill_token_threshold) + if self.cuda_graph_sizes is None: + self.cuda_graph_sizes = [self.max_num_seqs] + @model_validator(mode='after') def _verify_args(self) -> Self: if (self.max_num_batched_tokens < self.max_model_len From b62185a217370afc89af9190ffaae4a0abf9ea4e Mon Sep 17 00:00:00 2001 From: izhuhaoran Date: Wed, 25 Jun 2025 15:12:42 +0800 Subject: [PATCH 2/5] add comment for cuda_graph_sizes set Signed-off-by: izhuhaoran --- vllm/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config.py b/vllm/config.py index 556c9ffe5896..6851ef0803af 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2203,6 +2203,7 @@ def __post_init__(self) -> None: self.max_num_partial_prefills, self.max_long_partial_prefills, self.long_prefill_token_threshold) + # If cuda_graph_sizes is not specified, default set to [max_num_seqs]. if self.cuda_graph_sizes is None: self.cuda_graph_sizes = [self.max_num_seqs] From 3e0089c78367b881e3d8349588795691c4955665 Mon Sep 17 00:00:00 2001 From: izhuhaoran Date: Wed, 25 Jun 2025 16:00:54 +0800 Subject: [PATCH 3/5] fix lint error Signed-off-by: izhuhaoran --- vllm/config.py | 2 +- vllm/engine/arg_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 6851ef0803af..08fd1d73522a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2033,7 +2033,7 @@ class SchedulerConfig: NOTE: This will be replaced by speculative config in the future; it is present to enable correctness tests until then.""" - cuda_graph_sizes: list[int] = None # type: ignore + cuda_graph_sizes: Optional[list[int]] = None # type: ignore """Cuda graph capture sizes 1. if none provided, then default set to [max_num_seqs] 2. if one value is provided, then the capture list would follow the diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9d1008b6b350..cc4b4900f138 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -303,8 +303,8 @@ class EngineArgs: kv_cache_dtype: CacheDType = CacheConfig.cache_dtype seed: Optional[int] = ModelConfig.seed max_model_len: Optional[int] = ModelConfig.max_model_len - cuda_graph_sizes: list[int] = get_field(SchedulerConfig, - "cuda_graph_sizes") + cuda_graph_sizes: Optional[list[int]] = get_field(SchedulerConfig, + "cuda_graph_sizes") # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without # notice. From 774f789995f0b1ff232af7bf5637e44a016a2a39 Mon Sep 17 00:00:00 2001 From: izhuhaoran Date: Wed, 25 Jun 2025 16:21:45 +0800 Subject: [PATCH 4/5] fix ci lint error Signed-off-by: izhuhaoran --- vllm/config.py | 4 ++-- vllm/engine/arg_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 08fd1d73522a..8bfd45757402 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2033,7 +2033,7 @@ class SchedulerConfig: NOTE: This will be replaced by speculative config in the future; it is present to enable correctness tests until then.""" - cuda_graph_sizes: Optional[list[int]] = None # type: ignore + cuda_graph_sizes: list[int] = field(default_factory=lambda: []) """Cuda graph capture sizes 1. if none provided, then default set to [max_num_seqs] 2. if one value is provided, then the capture list would follow the @@ -2204,7 +2204,7 @@ def __post_init__(self) -> None: self.long_prefill_token_threshold) # If cuda_graph_sizes is not specified, default set to [max_num_seqs]. - if self.cuda_graph_sizes is None: + if not self.cuda_graph_sizes: self.cuda_graph_sizes = [self.max_num_seqs] @model_validator(mode='after') diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cc4b4900f138..9d1008b6b350 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -303,8 +303,8 @@ class EngineArgs: kv_cache_dtype: CacheDType = CacheConfig.cache_dtype seed: Optional[int] = ModelConfig.seed max_model_len: Optional[int] = ModelConfig.max_model_len - cuda_graph_sizes: Optional[list[int]] = get_field(SchedulerConfig, - "cuda_graph_sizes") + cuda_graph_sizes: list[int] = get_field(SchedulerConfig, + "cuda_graph_sizes") # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without # notice. From 8118ceb4fae3e24b587397990599a3c8e3af86d0 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Wed, 25 Jun 2025 12:53:20 -0400 Subject: [PATCH 5/5] chore: use default constructor as list --- vllm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 8bfd45757402..0c20d31558d6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2033,7 +2033,7 @@ class SchedulerConfig: NOTE: This will be replaced by speculative config in the future; it is present to enable correctness tests until then.""" - cuda_graph_sizes: list[int] = field(default_factory=lambda: []) + cuda_graph_sizes: list[int] = field(default_factory=list) """Cuda graph capture sizes 1. if none provided, then default set to [max_num_seqs] 2. if one value is provided, then the capture list would follow the