[MRV2] Extensible CG dispatch rework #35959
Merged: WoosukKwon merged 35 commits into vllm-project:main from neuralmagic:lwilkinson/mrv2-cg-dispatch on Mar 9, 2026.
Commits (35, all by LucasWilkinson):
- d5f114c wip
- 81e3ac3 wip
- 73a36ff cleanup
- 080ee62 wip
- 4f3ed1a clean
- 29af75c cleanup
- f4626a5 cleanup
- 3e142a0 Refactor cudagraph capture: base class handles warmup and capture mec…
- 18f3e24 cleanup
- 76b7d6f Simplify Eagle cudagraph: FULL mode only, align with main
- ac055f2 Make num_reqs_padded required in gather_block_tables
- 3ad20f7 cleanup
- 5d466b6 cleanup
- 1dc0457 cleanup
- db65896 cleanup
- c3419a4 cleanup
- 4d85377 cleanup
- d3c93ed cleanup
- 91aa819 cleanup
- 43bb5ca cleanup
- 6e6c062 cleanup
- 465ca0d cleanup
- 64db442 cleanup
- d0f8bfe cleanup
- 6e8b57e cleanup
- 1141328 fix
- d9fe9a5 review comments + fix PIECEWISE only
- ad1834c fix deserialize
- fde7e34 cleanup DP and don't request pad for PIECEWISE
- 0198457 cleanup
- 54b9c79 fix hang
- 03b5789 fix
- 82fbc2d cleanup
- aba1411 cleanup
- f424970 fix(eagle): use dedicated cudagraph pool to prevent memory overlap
```diff
@@ -104,19 +104,24 @@ def apply_staged_writes(self) -> None:
         self.num_blocks.copy_to_uva()
 
     def gather_block_tables(
-        self, idx_mapping: torch.Tensor
+        self,
+        idx_mapping: torch.Tensor,
+        num_reqs_padded: int,
     ) -> tuple[torch.Tensor, ...]:
         num_reqs = idx_mapping.shape[0]
-        _gather_block_tables_kernel[(self.num_kv_cache_groups, num_reqs)](
+        # Launch kernel with num_reqs_padded to fuse zeroing of padded rows.
+        _gather_block_tables_kernel[(self.num_kv_cache_groups, num_reqs_padded)](
             idx_mapping,
             self.block_table_ptrs,
             self.input_block_table_ptrs,
             self.block_table_strides,
             self.num_blocks.gpu,
             self.num_blocks.gpu.stride(0),
+            num_reqs,
+            self.input_block_tables[0].shape[1],  # max_num_blocks
             BLOCK_SIZE=1024,  # type: ignore
         )
-        return tuple(block_table[:num_reqs] for block_table in self.input_block_tables)
+        return tuple(bt[:num_reqs_padded] for bt in self.input_block_tables)
 
     def get_dummy_block_tables(self, num_reqs: int) -> tuple[torch.Tensor, ...]:
         # NOTE(woosuk): The output may be used for CUDA graph capture.
```

Collaborator (on the padded kernel launch): QQ: Why do we need to zero out the block table at all?

Collaborator (on the `max_num_blocks` argument): Is this safe? IIUC, block tables of different KV cache groups might have different sizes.
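The hunk above changes `gather_block_tables` to take an explicit `num_reqs_padded` and return slices of that fixed length, with rows past the real `num_reqs` zeroed in the same kernel launch. A minimal pure-Python sketch of that semantics (the helper and its list-of-lists "tensors" are illustrative stand-ins, not the actual vLLM implementation):

```python
# Hypothetical sketch of the padded-gather semantics, assuming plain
# Python lists in place of torch tensors.

def gather_block_tables(idx_mapping, src_block_table, dst_block_table,
                        num_reqs_padded):
    """Copy src[idx_mapping[i]] -> dst[i] for real requests, and zero
    dst rows in [num_reqs, num_reqs_padded) in the same pass."""
    num_reqs = len(idx_mapping)
    assert num_reqs_padded >= num_reqs
    max_num_blocks = len(dst_block_table[0])
    for batch_idx in range(num_reqs_padded):
        if batch_idx >= num_reqs:
            # Padded row: zeroed, mirroring the fused kernel branch.
            dst_block_table[batch_idx] = [0] * max_num_blocks
        else:
            req_idx = idx_mapping[batch_idx]
            dst_block_table[batch_idx] = list(src_block_table[req_idx])
    # Return a fixed-length view, as the new code slices to num_reqs_padded.
    return dst_block_table[:num_reqs_padded]
```

Returning a `num_reqs_padded`-length view (rather than `num_reqs`) keeps the output shape static across batches of different real sizes, which is what CUDA graph capture and replay require.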
```diff
@@ -130,6 +135,7 @@ def compute_slot_mappings(
         idx_mapping: torch.Tensor,
         query_start_loc: torch.Tensor,
         positions: torch.Tensor,
+        num_tokens_padded: int,
     ) -> torch.Tensor:
         num_reqs = idx_mapping.shape[0]
         num_tokens = positions.shape[0]
@@ -151,7 +157,7 @@ def compute_slot_mappings(
             PAD_ID=PAD_SLOT_ID,
             TRITON_BLOCK_SIZE=1024,  # type: ignore
         )
-        return self.slot_mappings[:, :num_tokens]
+        return self.slot_mappings[:, :num_tokens_padded]
 
     def get_dummy_slot_mappings(self, num_tokens: int) -> torch.Tensor:
         # Fill the entire slot_mappings tensor, not just the first `num_tokens` entries.
```
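Analogously to the block tables, the slot mappings are now sliced to `num_tokens_padded`, with the tail filled by the `PAD_SLOT_ID` sentinel that the kernel already passes as `PAD_ID`. A small sketch of the resulting shape contract (the helper name, list representation, and sentinel value `-1` are assumptions for illustration):

```python
# Illustrative sketch: the returned mapping has a fixed padded length,
# with real slot ids first and a sentinel in the padded tail.
PAD_SLOT_ID = -1  # hypothetical value; the real constant lives in vLLM

def compute_slot_mappings(slot_ids, num_tokens_padded):
    """Return a num_tokens_padded-length mapping: real slots, then padding."""
    assert num_tokens_padded >= len(slot_ids)
    return slot_ids + [PAD_SLOT_ID] * (num_tokens_padded - len(slot_ids))
```

Because the padded entries carry a sentinel rather than stale slot ids, downstream KV-cache writes can mask them out while the tensor shape stays constant for graph replay.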
```diff
@@ -173,21 +179,31 @@ def _gather_block_tables_kernel(
     block_table_strides,  # [num_kv_cache_groups]
     num_blocks_ptr,  # [num_kv_cache_groups, max_num_reqs]
     num_blocks_stride,
+    num_reqs,  # actual number of requests (for padding)
+    max_num_blocks,  # stride for zeroing padded rows
     BLOCK_SIZE: tl.constexpr,
 ):
     # kv cache group id
     group_id = tl.program_id(0)
     batch_idx = tl.program_id(1)
-    req_idx = tl.load(batch_idx_to_req_idx + batch_idx)
 
+    stride = tl.load(block_table_strides + group_id)
+    dst_block_table_ptr = _load_ptr(dst_block_table_ptrs + group_id, tl.int32)
+    dst_row_ptr = dst_block_table_ptr + batch_idx * stride
+
+    if batch_idx >= num_reqs:
+        # Zero out padded rows.
+        for i in tl.range(0, max_num_blocks, BLOCK_SIZE):
+            offset = i + tl.arange(0, BLOCK_SIZE)
+            tl.store(dst_row_ptr + offset, 0, mask=offset < max_num_blocks)
+        return
+
+    req_idx = tl.load(batch_idx_to_req_idx + batch_idx)
     group_num_blocks_ptr = num_blocks_ptr + group_id * num_blocks_stride
     num_blocks = tl.load(group_num_blocks_ptr + req_idx)
 
-    stride = tl.load(block_table_strides + group_id)
     src_block_table_ptr = _load_ptr(src_block_table_ptrs + group_id, tl.int32)
     src_row_ptr = src_block_table_ptr + req_idx * stride
-    dst_block_table_ptr = _load_ptr(dst_block_table_ptrs + group_id, tl.int32)
-    dst_row_ptr = dst_block_table_ptr + batch_idx * stride
 
     for i in tl.range(0, num_blocks, BLOCK_SIZE):
         offset = i + tl.arange(0, BLOCK_SIZE)
```
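The kernel launches a 2D grid of (KV-cache group, padded batch index); programs past the real `num_reqs` only zero their destination row, while the rest gather the source row. A hypothetical pure-Python emulation of that grid's control flow (pointer arithmetic replaced by list indexing, names illustrative):

```python
# Hypothetical emulation of the Triton grid above: one "program" per
# (kv-cache group, padded batch index). Shows the branch structure,
# not the real pointer arithmetic or masked vector stores.

def run_gather_grid(num_kv_cache_groups, num_reqs_padded, num_reqs,
                    idx_mapping, src_tables, dst_tables, num_blocks):
    for group_id in range(num_kv_cache_groups):
        for batch_idx in range(num_reqs_padded):
            dst_row = dst_tables[group_id][batch_idx]
            if batch_idx >= num_reqs:
                # Padded program: zero the whole row and exit early.
                for j in range(len(dst_row)):
                    dst_row[j] = 0
                continue
            req_idx = idx_mapping[batch_idx]
            # Copy only the first num_blocks entries of the source row;
            # entries beyond that are left untouched, as in the kernel.
            n = num_blocks[group_id][req_idx]
            src_row = src_tables[group_id][req_idx]
            for j in range(n):
                dst_row[j] = src_row[j]
```

Note the early-exit branch makes the per-program work data-independent of the request contents for padded slots, which is what lets the zeroing be fused into the same launch instead of a separate `fill_` pass.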
Collaborator: I think this implicit conversion can be confusing and error prone.