8 changes: 7 additions & 1 deletion vllm/v1/attention/backends/flex_attention.py
@@ -13,6 +13,7 @@
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata, AttentionType,
is_quantized_kv_cache)
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
@@ -236,7 +237,12 @@ def final_mask_mod(

def build_block_mask(self) -> BlockMask:
assert self.mask_mod is not None
return create_block_mask_compiled(
# FIXME: With TP>1, create_block_mask_compiled will raise
# CUDA error: an illegal memory access was encountered
Comment on lines +240 to +241
Contributor

medium

The FIXME comment clearly explains the issue with create_block_mask_compiled when the tensor parallel world size is greater than 1. To ensure this is addressed in the future, consider creating a GitHub issue to track this underlying CUDA error if one doesn't exist already. This would help in eventually enabling the compiled version universally.

Member Author

The full traceback of the illegal memory access error:

Log
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/vllm/v1/attention/backends/flex_attention.py", line 262, in __post_init__
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     self.block_mask = self.build_block_mask()
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]                       ^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/vllm/v1/attention/backends/flex_attention.py", line 246, in build_block_mask
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return create_block_mask_fn(
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return fn(*args, **kwargs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/nn/attention/flex_attention.py", line 824, in create_block_mask
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     def create_block_mask(
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 838, in _fn
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return fn(*args, **kwargs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_functorch/aot_autograd.py", line 1201, in forward
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return compiled_fn(full_args)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 328, in runtime_wrapper
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     all_outs = call_func_at_runtime_with_args(
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/utils.py", line 126, in call_func_at_runtime_with_args
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     out = normalize_as_list(f(args))
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]                             ^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 689, in inner_fn
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     outs = compiled_fn(args)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 495, in wrapper
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return compiled_fn(runtime_args)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/output_code.py", line 460, in __call__
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return self.current_callable(inputs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/compile_fx.py", line 1372, in run
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return compiled_fn(new_inputs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/cudagraph_trees.py", line 387, in deferred_cudagraphify
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     fn, out = cudagraphify(model, inputs, new_static_input_idxs, *args, **kwargs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/cudagraph_trees.py", line 448, in cudagraphify
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return manager.add_function(
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/cudagraph_trees.py", line 2308, in add_function
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return fn, fn(inputs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]                ^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/cudagraph_trees.py", line 1997, in run
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     out = self._run(new_inputs, function_id)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/cudagraph_trees.py", line 2104, in _run
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     out = self.run_eager(new_inputs, function_id)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/cudagraph_trees.py", line 2269, in run_eager
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return node.run(new_inputs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/cudagraph_trees.py", line 668, in run
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     out = self.wrapped_function.model(new_inputs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/root/.cache/vllm/torch_compile_cache/26b5568570/rank_0_0/inductor_cache/4o/c4osf7wcdszj5dy7kaxakhrrucni4ac5aiyysa63j3fmz37p6jxn.py", line 561, in call
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     triton_per_fused__to_copy_sum_7.run(buf18, buf22, 5718, triton_per_fused__to_copy_sum_7_r0_numel, stream=stream0)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 909, in run
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     self.autotune_to_one_config(*args, **kwargs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 763, in autotune_to_one_config
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     timings = self.benchmark_all_configs(*args, **kwargs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 738, in benchmark_all_configs
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     launcher: self.bench(launcher, *args, **kwargs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 616, in bench
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return benchmarker.benchmark_gpu(kernel_call, rep=40)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     return fn(self, *args, **kwargs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]            ^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     _callable()
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/torch/_inductor/runtime/triton_heuristics.py", line 601, in kernel_call
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     launcher(
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "<string>", line 5, in launcher
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]   File "/kaggle/working/vllm/.venv/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 444, in __call__
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527]     self.launch(*args, **kwargs)
(VllmWorker rank=0 pid=13527) ERROR 06-15 06:22:40 [multiproc_executor.py:527] RuntimeError: Triton Error [CUDA]: an illegal memory access was encountered

@drisspg Any idea about this error?

Collaborator

cc @zou3519 for the torch.compile-related issue.

Contributor

Hey, I actually just noticed this too; this was not the case until pretty recently. Going to create an issue + tracking for this.

create_block_mask_fn = (create_block_mask_compiled
if get_tensor_model_parallel_world_size() == 1
else create_block_mask)
return create_block_mask_fn(
self.mask_mod,
None,
None,
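For reference, a standalone sketch of the workaround in this hunk, written against PyTorch's public flex_attention API; the tp_world_size argument and the causal mask_mod are assumptions for illustration, not vLLM's actual code path:

import torch
from torch.nn.attention.flex_attention import BlockMask, create_block_mask

# vLLM pre-compiles the builder; the compiled variant is what currently hits
# the illegal memory access once tensor parallelism is enabled.
create_block_mask_compiled = torch.compile(create_block_mask, fullgraph=True)


def causal_mask_mod(b, h, q_idx, kv_idx):
    # Simple causal mask, for illustration only.
    return q_idx >= kv_idx


def build_block_mask(tp_world_size: int, seq_len: int = 128,
                     device: str = "cuda") -> BlockMask:
    # Fall back to the eager builder whenever TP > 1 until the underlying
    # CUDA/Triton issue is resolved.
    fn = create_block_mask_compiled if tp_world_size == 1 else create_block_mask
    return fn(causal_mask_mod, B=None, H=None,
              Q_LEN=seq_len, KV_LEN=seq_len, device=device)
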
2 changes: 2 additions & 0 deletions vllm/v1/engine/core.py
@@ -84,6 +84,8 @@ def __init__(self,

vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
self.collective_rpc("initialize_cache",
Collaborator

wondering why only TP + FlexAttention needs this?

Member Author

Because FlexAttention needs num_gpu_blocks for its calculations, while other attention backends don't.

Not sure if this is intended, but in V1 only the engine core's cache_config gets the updated num_gpu_blocks; a worker in a different process (the TP case) won't have num_gpu_blocks updated without a collective_rpc call.

Therefore, in distributed inference the worker's num_gpu_blocks is still None, which causes the error in the PR description.
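
A minimal, self-contained sketch of that failure mode (hypothetical names, not vLLM's actual classes): the worker's cache config still holds None, so anything that sizes the KV cache from it breaks until the engine core pushes the block counts over the RPC:

from dataclasses import dataclass
from typing import Optional


@dataclass
class CacheConfigSketch:
    block_size: int = 16
    # Never updated in a TP worker process unless the engine core sends the RPC.
    num_gpu_blocks: Optional[int] = None


def total_cache_tokens(cfg: CacheConfigSketch) -> int:
    # A FlexAttention-style backend needs a concrete block count to size its
    # block mask over the whole KV cache.
    return cfg.num_gpu_blocks * cfg.block_size


worker_cfg = CacheConfigSketch()
try:
    total_cache_tokens(worker_cfg)
except TypeError as err:
    print(f"worker without initialize_cache: {err}")  # None * int fails

# Effect of collective_rpc("initialize_cache", ...) landing in the worker:
worker_cfg.num_gpu_blocks = 8192
print(total_cache_tokens(worker_cfg))  # 131072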

Collaborator

I see. Could you check whether we need to add a condition so this is only called when TP > 1?

Member Author

For the single-process case, we use UniProcExecutor instead of MultiprocExecutor:

elif distributed_executor_backend == "mp":
    from vllm.v1.executor.multiproc_executor import MultiprocExecutor
    executor_class = MultiprocExecutor
elif distributed_executor_backend == "uni":
    executor_class = UniProcExecutor

Given that UniProcExecutor also implements collective_rpc properly, it's safe to call collective_rpc there as well, especially since we only update cache_config here, which the preceding lines have already done for the single-process case:

def collective_rpc(self,
                   method: Union[str, Callable],
                   timeout: Optional[float] = None,
                   args: Tuple = (),
                   kwargs: Optional[Dict] = None) -> List[Any]:
    if kwargs is None:
        kwargs = {}
    answer = run_method(self.driver_worker, method, args, kwargs)
    return [answer]

I have checked that TP=1 still works.
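
As a usage note, here is a small runnable illustration (stub classes with assumed names, not vLLM's actual executors) of why the unconditional call is harmless in the single-process case: collective_rpc there reduces to a direct method call on the one driver worker, so it amounts to an idempotent re-assignment:

from typing import Any


class WorkerStub:
    """Stands in for a worker that stores cache sizes on its cache config."""

    def __init__(self) -> None:
        self.num_gpu_blocks = None
        self.num_cpu_blocks = None

    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
        self.num_gpu_blocks = num_gpu_blocks
        self.num_cpu_blocks = num_cpu_blocks


class UniProcExecutorStub:
    """Mirrors the collective_rpc shown above: no IPC, just a call on the driver."""

    def __init__(self) -> None:
        self.driver_worker = WorkerStub()

    def collective_rpc(self, method: str, args: tuple = ()) -> list[Any]:
        # Equivalent of run_method(self.driver_worker, method, args, {}).
        return [getattr(self.driver_worker, method)(*args)]


executor = UniProcExecutorStub()
executor.collective_rpc("initialize_cache", args=(8192, 0))
print(executor.driver_worker.num_gpu_blocks)  # 8192, same effect as a direct call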

args=(num_gpu_blocks, num_cpu_blocks))

self.structured_output_manager = StructuredOutputManager(vllm_config)

5 changes: 5 additions & 0 deletions vllm/v1/worker/gpu_worker.py
@@ -112,6 +112,11 @@ def wake_up(self, tags: Optional[list[str]] = None) -> None:
buffer.data.copy_(self._sleep_saved_buffers[name].data)
self._sleep_saved_buffers = {}

def initialize_cache(self, num_gpu_blocks: int,
Collaborator

this sounds more like "setting_cache_size" instead of initialize_cache?

Member Author

Hmmm, cache_config's num_gpu_blocks and num_cpu_blocks are updated in initialize_cache for the worker in v0, which is a base-class method:

def initialize_cache(self, num_gpu_blocks: int,
                     num_cpu_blocks: int) -> None:
    """Initialize the KV cache with the given size in blocks."""
    raise NotImplementedError

vllm/vllm/worker/worker.py, lines 312 to 325 in 3d330c4:

def initialize_cache(self, num_gpu_blocks: int,
                     num_cpu_blocks: int) -> None:
    """Allocate GPU and CPU KV cache with the specified number of blocks.

    This also warms up the model, which may record CUDA graphs.
    """
    raise_if_cache_size_invalid(
        num_gpu_blocks, self.cache_config.block_size,
        self.cache_config.is_attention_free,
        self.model_config.max_model_len,
        self.parallel_config.pipeline_parallel_size)
    self.cache_config.num_gpu_blocks = num_gpu_blocks
    self.cache_config.num_cpu_blocks = num_cpu_blocks

Although this method was not used by v1 before this PR, I think reusing the method shared with v0 keeps the worker implementation consistent.

num_cpu_blocks: int) -> None:
self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks

def init_device(self):
if self.device_config.device.type == "cuda":
# torch.distributed.all_reduce does not free the input tensor until
5 changes: 5 additions & 0 deletions vllm/v1/worker/tpu_worker.py
@@ -93,6 +93,11 @@ def __init__(
if self.model_config.seed is None:
self.model_config.seed = 0

def initialize_cache(self, num_gpu_blocks: int,
num_cpu_blocks: int) -> None:
self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks

def init_device(self):
os.environ["PJRT_DEVICE"] = "TPU"
# Note: Currently the XLA compiler wrongly uses 2D ring strategy on 1D