3 changes: 3 additions & 0 deletions tests/compile/test_config.py
@@ -405,6 +405,9 @@ def test_should_split():
(None, 0, 1, False, 2048, CUDAGraphMode.NONE, 0),
# truncated to nearest multiple of 8 or 16
(None, 257, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
# max_num_batched_tokens <= max_cudagraph_capture_size should always be
# captured even if not landing on a 16-stride step
(None, 2048, 1, False, 257, CUDAGraphMode.FULL_AND_PIECEWISE, 257),
# max from list
([1, 2, 4, 15], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 15),
# SP forces full-graph compilation, sizes are filtered by TP
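As a quick illustration of what the new row asserts, here is a minimal sketch, not the actual vLLM helper; the function name and the rounding details outside the two asserted cases are assumptions:

```python
# Hypothetical sketch of the behavior the two highlighted test rows exercise.
def max_captured_size(max_cudagraph_capture_size: int,
                      max_num_batched_tokens: int) -> int:
    # The effective cap cannot exceed the batch-token budget.
    cap = min(max_cudagraph_capture_size, max_num_batched_tokens)
    # Off-stride caps are truncated to the nearest multiple of 8 (below 256)
    # or 16 (from 256 on) ...
    stride = 16 if cap >= 256 else 8
    largest = (cap // stride) * stride
    # ... but max_num_batched_tokens itself is captured exactly whenever it
    # fits within the configured cap, even when off-stride (the new case).
    if max_num_batched_tokens <= max_cudagraph_capture_size:
        largest = max(largest, max_num_batched_tokens)
    return largest

assert max_captured_size(257, 2048) == 256  # existing row: truncated to 256
assert max_captured_size(2048, 257) == 257  # new row: 257 captured exactly
```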
10 changes: 10 additions & 0 deletions vllm/config/vllm.py
@@ -1432,6 +1432,10 @@ def _set_cudagraph_sizes(self):
cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
range(256, max_graph_size + 1, 16))

`max_num_batched_tokens` is also appended to the list if it fits
within `max_cudagraph_capture_size`, so the max batch size is captured
even when off-stride.

In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
will be the final sizes to capture cudagraph (in ascending order).

@@ -1520,6 +1524,12 @@ def _set_cudagraph_sizes(self):
cudagraph_capture_sizes += list(
range(256, max_cudagraph_capture_size + 1, 16)
)
# ensure max_num_tokens is captured if within max capture size
if (
max_num_tokens <= max_cudagraph_capture_size
Member:

I think we also need to update the cudagraph doc for the capture logic update

Contributor Author:

I think it would be intuitive to assume this is the default behavior. But I am happy to update the doc too if needed. Could you point me to where that is?

Member:

I think we also need to change the docstring here

vllm/vllm/config/vllm.py, lines 1423 to 1460 in fd74c90:

def _set_cudagraph_sizes(self):
    """
    vLLM defines the default candidate list of batch sizes for CUDA graph
    capture as:

    ```python
    max_graph_size = min(max_num_seqs * 2, 512)
    # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
    # up to max_graph_size
    cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
        range(256, max_graph_size + 1, 16))
    ```

    In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
    will be the final sizes to capture cudagraph (in ascending order).

    These sizes are used to capture and reuse CUDA graphs for
    performance-critical paths (e.g., decoding). Capturing enables
    significantly faster kernel dispatch by avoiding Python overhead. The
    list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
    most GPUs), which controls the total allowed number of tokens in a
    batch. Since each sequence may have a variable number of tokens, the
    maximum usable batch size will depend on actual sequence lengths.

    Example:
        With `max_num_batched_tokens = 8192`, and typical sequences
        averaging ~32 tokens, most practical batch sizes fall below 256.
        However, the system will still allow capture sizes up to 512 if
        shape and memory permit.

    Note:
        If users explicitly specify cudagraph capture sizes in the
        compilation config, those will override this default logic.

    At runtime:
    - If batch size <= one of the `cudagraph_capture_sizes`, the closest
      padded CUDA graph will be used.
    - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
      not be used.

Contributor Author (@wzhao18, Apr 28, 2026):

Sounds good. I updated the docstring. Please check whether it looks reasonable.

and max_num_tokens not in cudagraph_capture_sizes
):
cudagraph_capture_sizes.append(max_num_tokens)
# de-duplicate and sort the sizes
cudagraph_capture_sizes = sorted(set(cudagraph_capture_sizes))

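Taken together with the default schedule quoted in the docstring, the construction now looks roughly like this. This is a condensed sketch rather than the full `_set_cudagraph_sizes` body; the concrete values of `max_cudagraph_capture_size` and `max_num_tokens` are assumed for illustration, and only the `if` block mirrors the lines added in this PR:

```python
# Condensed sketch of the capture-size list construction (illustrative only).
max_cudagraph_capture_size = 2048   # example cap, assumed for illustration
max_num_tokens = 257                # example max_num_batched_tokens

# Default schedule: 1, 2, 4, multiples of 8 up to 256, then multiples of 16.
cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8))
if max_cudagraph_capture_size >= 256:
    cudagraph_capture_sizes += list(
        range(256, max_cudagraph_capture_size + 1, 16)
    )

# New in this PR: ensure max_num_tokens is captured if within max capture size.
if (
    max_num_tokens <= max_cudagraph_capture_size
    and max_num_tokens not in cudagraph_capture_sizes
):
    cudagraph_capture_sizes.append(max_num_tokens)

# De-duplicate and sort the sizes.
cudagraph_capture_sizes = sorted(set(cudagraph_capture_sizes))
assert 257 in cudagraph_capture_sizes  # off-stride batch size is now captured
```

With these example values, 257 ends up in the list even though it is not on the 16-stride grid, which is exactly what the new test row checks.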