Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
a5bfa27
[Async][spec decode] Zero-bubble async scheduling +spec decoding
Mar 25, 2026
e599872
[Async][spec decode] Zero-bubble async scheduling +spec decoding
Mar 25, 2026
33a6d13
[Async][spec decode] Zero-bubble async scheduling +spec decoding
Mar 25, 2026
0a60c26
optimize
Mar 27, 2026
9248184
update to 0324
22dimensions Mar 25, 2026
1dfa935
fix: add vllm_is_batch_invariant compatibility wrapper
claude Mar 25, 2026
29eb701
Merge branch 'main' into zero_bubble_async_spec
HF-001 Mar 31, 2026
c82dad6
fix
Mar 31, 2026
62c38ee
fix format
Mar 31, 2026
74e2526
Merge branch 'main' into zero_bubble_async_spec
HF-001 Mar 31, 2026
dfe1b6d
fix
Mar 31, 2026
5f157ba
fix
Mar 31, 2026
9b0ff73
fix
Mar 31, 2026
c39d214
fix
Mar 31, 2026
013dcbe
fix
Mar 31, 2026
f28aacd
Merge branch 'main' into zero_bubble_async_spec
HF-001 Mar 31, 2026
8ca4fcd
Merge branch 'main' into zero_bubble_async_spec
HF-001 Mar 31, 2026
69bf146
fix
HF-001 Mar 31, 2026
fce6a54
Merge branch 'main' into zero_bubble_async_spec
HF-001 Mar 31, 2026
c45a066
fix ut test
Apr 1, 2026
2acc473
fix ut test
Apr 1, 2026
946107d
fix ut test
Apr 1, 2026
80a604b
fix ut test
Apr 1, 2026
5c88ee5
fix
Apr 1, 2026
c1e05db
fix
Apr 1, 2026
f36c75c
Merge branch 'main' into zero_bubble_async_spec
HF-001 Apr 1, 2026
eba0a00
Merge branch 'main' into zero_bubble_async_spec
HF-001 Apr 2, 2026
687e8c1
fix kvcache
Apr 2, 2026
a9bac6f
fix ci
Apr 2, 2026
e931e98
fix
Apr 2, 2026
4363d13
fix
Apr 2, 2026
5609a66
fix
Apr 2, 2026
7f441b0
fix
HF-001 Apr 2, 2026
e01874d
Merge branch 'main' into zero_bubble_async_spec
HF-001 Apr 2, 2026
bcbee07
fix
Apr 3, 2026
c83ea55
fix
Apr 3, 2026
bd21d43
Merge branch 'main' into zero_bubble_async_spec
HF-001 Apr 3, 2026
c0239cc
fix
Apr 3, 2026
b4b1dfb
Merge branch 'main' into zero_bubble_async_spec
HF-001 Apr 4, 2026
39f2498
fix
HF-001 Apr 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/_e2e_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ on:
continue_on_error:
required: false
type: boolean
default: false
default: true
# The following inputs are used by comment-triggered E2E tests (/e2e <tests>).
# They carry space-separated pytest paths, categorized by runner type.
# Leave empty (default) when running label-triggered full/light suites.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ jobs:
strategy:
fail-fast: false
matrix:
vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029]
vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
needs: [parse-trigger]
if: ${{ needs.parse-trigger.outputs.allowed == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
100 changes: 55 additions & 45 deletions tests/ut/worker/test_block_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,16 @@

import numpy as np
import torch

# import vllm.utils.cpu_triton_utils as cpu_tl
from vllm.distributed.parallel_state import GroupCoordinator

from tests.ut.base import TestBase


class TestBlockTableComputeSlotMapping(TestBase):
"""Test suite for BlockTable.compute_slot_mapping() method

This test suite covers different configurations of DCP (Decode Context Parallelism),
PCP (Prefill Context Parallelism), and cp_kv_cache_interleave_size to ensure
correct slot_mapping calculation on different ranks.
Expand All @@ -41,13 +43,13 @@ def setUp(self):
self.device = torch.device("cpu")
self.kernel_sizes = [128]

def create_block_table(self, dcp_world_size, dcp_rank, pcp_world_size,
pcp_rank, cp_kv_cache_interleave_size):
def create_block_table(self, dcp_world_size, dcp_rank, pcp_world_size, pcp_rank, cp_kv_cache_interleave_size):
"""Helper method to create BlockTable with mocked distributed groups"""

with patch('vllm_ascend.worker.block_table.get_dcp_group') as mock_get_dcp_group, \
patch('vllm_ascend.worker.block_table.get_pcp_group') as mock_get_pcp_group:

with (
patch("vllm_ascend.worker.block_table.get_dcp_group") as mock_get_dcp_group,
patch("vllm_ascend.worker.block_table.get_pcp_group") as mock_get_pcp_group,
):
# Mock DCP group
mock_dcp_group = MagicMock(spec=GroupCoordinator)
mock_dcp_group.world_size = dcp_world_size
Expand All @@ -71,23 +73,21 @@ def create_block_table(self, dcp_world_size, dcp_rank, pcp_world_size,
device=self.device,
kernel_sizes=self.kernel_sizes,
cp_kv_cache_interleave_size=cp_kv_cache_interleave_size,
num_speculative_tokens=0)
num_speculative_tokens=0,
)

return block_table

def setup_block_table_data(self, block_table, num_reqs=2):
"""Helper method to populate block table with test data"""
# Add block IDs for each request
for i in range(num_reqs):
block_ids = list(range(i * 4,
(i + 1) * 4)) # [0,1,2,3], [4,5,6,7], etc.
block_ids = list(range(i * 4, (i + 1) * 4)) # [0,1,2,3], [4,5,6,7], etc.
block_table.add_row(block_ids, i)

def _test_slot_mapping_for_ranks(self, dcp_world_size, pcp_world_size,
cp_kv_cache_interleave_size,
test_configs):
def _test_slot_mapping_for_ranks(self, dcp_world_size, pcp_world_size, cp_kv_cache_interleave_size, test_configs):
"""Helper method to test slot_mapping across multiple ranks

Args:
dcp_world_size: Number of DCP ranks
pcp_world_size: Number of PCP ranks
Expand All @@ -97,31 +97,46 @@ def _test_slot_mapping_for_ranks(self, dcp_world_size, pcp_world_size,
for dcp_rank, pcp_rank, req_indices, positions, expected_result in test_configs:
with self.subTest(dcp_rank=dcp_rank, pcp_rank=pcp_rank):
block_table = self.create_block_table(
dcp_world_size, dcp_rank, pcp_world_size, pcp_rank,
cp_kv_cache_interleave_size)
dcp_world_size, dcp_rank, pcp_world_size, pcp_rank, cp_kv_cache_interleave_size
)

num_reqs = max(req_indices) + 1 if len(req_indices) > 0 else 1
self.setup_block_table_data(block_table, num_reqs=num_reqs)

block_table.compute_slot_mapping(req_indices, positions)
# Build query_start_loc [num_reqs + 1] from req_indices.
# query_start_loc holds the cumulative token count per request,
# e.g. req_indices=[0,0,1,1] -> query_start_loc=[0,2,4].
num_tokens = len(positions)
counts = np.bincount(req_indices, minlength=num_reqs)
query_start_loc_np = np.concatenate([[0], np.cumsum(counts)]).astype(np.int32)
query_start_loc = torch.from_numpy(query_start_loc_np)

# positions must be a torch int64 tensor to match the
# _compute_slot_mapping_kernel's positions_ptr type.
positions_tensor = torch.from_numpy(positions.astype(np.int64))
# block_table._compute_slot_mapping_kernel = cpu_tl.compute_slot_mapping_kernel
block_table.compute_slot_mapping(num_reqs, query_start_loc, positions_tensor)

actual_result = block_table.slot_mapping.np[:num_tokens]

actual_result = block_table.slot_mapping.np[:len(positions)]
np.testing.assert_array_equal(
actual_result, expected_result,
actual_result,
expected_result,
f"DCP={dcp_world_size}, PCP={pcp_world_size}, "
f"interleave={cp_kv_cache_interleave_size}, "
f"dcp_rank={dcp_rank}, pcp_rank={pcp_rank}")
f"dcp_rank={dcp_rank}, pcp_rank={pcp_rank}",
)

def test_compute_slot_mapping_dcp1_pcp1_interleave1(self):
"""Test compute_slot_mapping with DCP=1, PCP=1, interleave_size=1

With no parallelism (DCP=1, PCP=1), all tokens are local to the single rank.

Setup:
- Block size: 16
- Request 0 has blocks: [0, 1, 2, 3]
- Request 1 has blocks: [4, 5, 6, 7]

Test positions for each request:
- Request 0, position 0: block_id=0, offset=0 → slot = 0*128+0 = 0
- Request 0, position 1: block_id=0, offset=1 → slot = 0*128+1 = 1
Expand All @@ -137,14 +152,13 @@ def test_compute_slot_mapping_dcp1_pcp1_interleave1(self):
(0, 0, req_indices, positions, expected_result),
]

self._test_slot_mapping_for_ranks(dcp_world_size=1,
pcp_world_size=1,
cp_kv_cache_interleave_size=1,
test_configs=test_configs)
self._test_slot_mapping_for_ranks(
dcp_world_size=1, pcp_world_size=1, cp_kv_cache_interleave_size=1, test_configs=test_configs
)

def test_compute_slot_mapping_dcp4_pcp2_interleave1(self):
"""Test compute_slot_mapping with DCP=4, PCP=2, interleave_size=1

With interleave_size=1, tokens are distributed round-robin across all 8 ranks:
- Position 0 → Rank 0
- Position 1 → Rank 1
Expand Down Expand Up @@ -183,28 +197,25 @@ def test_compute_slot_mapping_dcp4_pcp2_interleave1(self):
for pcp_rank in range(2):
for dcp_rank in range(4):
current_rank = 4 * pcp_rank + dcp_rank
expected_result = np.array(rank_expectations[current_rank],
dtype=np.int32)
test_configs.append((dcp_rank, pcp_rank, req_indices,
positions, expected_result))
expected_result = np.array(rank_expectations[current_rank], dtype=np.int32)
test_configs.append((dcp_rank, pcp_rank, req_indices, positions, expected_result))

self._test_slot_mapping_for_ranks(dcp_world_size=4,
pcp_world_size=2,
cp_kv_cache_interleave_size=1,
test_configs=test_configs)
self._test_slot_mapping_for_ranks(
dcp_world_size=4, pcp_world_size=2, cp_kv_cache_interleave_size=1, test_configs=test_configs
)

def test_compute_slot_mapping_dcp4_pcp2_interleave128(self):
"""Test compute_slot_mapping with DCP=4, PCP=2, interleave_size=128

With interleave_size=128, tokens are distributed in chunks of 128 across ranks.
Virtual block size = 16 * 4 * 2 = 128

Token distribution with interleave_size=128:
- Positions 0-127 belong to rank 0 (first chunk of 128)
- Positions 128-255 belong to rank 1 (second chunk of 128)
- Positions 256-383 belong to rank 2 (third chunk of 128)
- And so on...

Using 130 positions ensures we test both rank 0 (positions 0-127) and rank 1 (positions 128-129).
"""
num_positions = 130
Expand Down Expand Up @@ -245,14 +256,13 @@ def test_compute_slot_mapping_dcp4_pcp2_interleave128(self):
expected_result = [-1] * 130

test_configs.append(
(dcp_rank, pcp_rank, req_indices, positions,
np.array(expected_result, dtype=np.int32)))
(dcp_rank, pcp_rank, req_indices, positions, np.array(expected_result, dtype=np.int32))
)

self._test_slot_mapping_for_ranks(dcp_world_size=4,
pcp_world_size=2,
cp_kv_cache_interleave_size=128,
test_configs=test_configs)
self._test_slot_mapping_for_ranks(
dcp_world_size=4, pcp_world_size=2, cp_kv_cache_interleave_size=128, test_configs=test_configs
)


if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
44 changes: 40 additions & 4 deletions vllm_ascend/attention/attention_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def build(
)

block_table = common_attn_metadata.block_table_tensor
seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
seq_lens = common_attn_metadata.seq_lens[:num_reqs].to("cpu")

slot_mapping = common_attn_metadata.slot_mapping[:num_actual_tokens]
# this slot_mapping override doesn't work since vllm will override it again. We should fix it vllm.
Expand Down Expand Up @@ -688,7 +688,26 @@ def full_graph_pa(
graph_params.handles[num_tokens].append(handle)
return output

def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: AscendMetadata):
def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: AscendMetadata, kv_cache=None):
# PrefillNoCache doesn't need key_cache, but other modes do
# Only initialize/require cache for modes that actually use it
if attn_metadata.attn_state != AscendAttentionState.PrefillNoCache:
# Initialize cache from kv_cache if not already set (for DecodeOnly mode)
if self.key_cache is None and kv_cache is not None:
if (
isinstance(kv_cache, torch.Tensor)
and kv_cache.dim() > 0
and kv_cache.shape[0] == 2
or isinstance(kv_cache, (list, tuple))
and len(kv_cache) >= 2
):
self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]

if self.key_cache is None:
raise RuntimeError(
f"key_cache is None in _get_fia_params for mode {attn_metadata.attn_state}. kv_cache={kv_cache}"
)

if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
block_size = 128
block_table = None
Expand Down Expand Up @@ -766,6 +785,7 @@ def forward_fused_infer_attention(
value: torch.Tensor,
attn_metadata: AscendMetadata,
output: torch.Tensor,
kv_cache=None,
):
# we inherit ForwardContext in model runner v2, when enable model
# runner v2, there is not capturing attribute in forward_context,
Expand All @@ -781,7 +801,9 @@ def forward_fused_infer_attention(
and self.sinks is None
):
return self._forward_fia_slidingwindow(query, attn_metadata, output)
key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata)
key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(
key, value, attn_metadata, kv_cache
)
num_tokens = attn_metadata.actual_seq_lengths_q[-1]
query = query[:num_tokens]
if (
Expand Down Expand Up @@ -927,7 +949,7 @@ def forward_impl(
):
output = self.forward_paged_attention(query, attn_metadata, output)
else:
output = self.forward_fused_infer_attention(query, key, value, attn_metadata, output)
output = self.forward_fused_infer_attention(query, key, value, attn_metadata, output, kv_cache)

return output

Expand Down Expand Up @@ -963,6 +985,20 @@ def forward(
num_tokens = query.shape[0]
if attn_metadata is None:
return output.fill_(0)

# Initialize key_cache and value_cache from kv_cache if not already set.
# This is needed for DecodeOnly mode where key/value are None but we still
# need access to the cache for attention computation.
if self.key_cache is None and kv_cache is not None:
if (
isinstance(kv_cache, torch.Tensor)
and kv_cache.dim() > 0
and kv_cache.shape[0] == 2
or isinstance(kv_cache, (list, tuple))
and len(kv_cache) >= 2
):
self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]

output_padded = None
if key is not None and value is not None:
output_padded = output
Expand Down
6 changes: 5 additions & 1 deletion vllm_ascend/attention/mla_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,11 @@ def build(

query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
self.query_lens = query_seq_lens_cpu[:num_reqs]
self.seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
self.seq_lens = None
if common_attn_metadata.seq_lens_cpu is not None:
self.seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
else:
self.seq_lens = common_attn_metadata.seq_lens[:num_reqs].to("cpu")

self.graph_pad_size = common_attn_metadata.graph_pad_size
block_table_size = self.get_block_table_size(common_attn_metadata, BUILD_METADATA_STEP_PREFILL)
Expand Down
7 changes: 6 additions & 1 deletion vllm_ascend/attention/sfa_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,12 @@ def build(

cum_query_lens = common_attn_metadata.query_start_loc[1 : num_reqs + 1]
seq_lens = common_attn_metadata.seq_lens[:num_reqs]
seq_lens_cpu = common_attn_metadata.seq_lens_cpu[:num_reqs]

seq_lens_cpu = None
if common_attn_metadata.seq_lens_cpu is not None:
seq_lens_cpu = common_attn_metadata.seq_lens_cpu[:num_reqs]
else:
seq_lens_cpu = common_attn_metadata.seq_lens[:num_reqs].to("cpu")

cos, sin = get_cos_and_sin_mla(input_positions, True)

Expand Down
6 changes: 4 additions & 2 deletions vllm_ascend/attention/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,10 @@ def unpadded(self, num_actual_tokens: int, num_actual_reqs: int) -> "AscendCommo
query_start_loc=self.query_start_loc[: num_actual_reqs + 1],
query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1],
seq_lens=self.seq_lens[:num_actual_reqs],
seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs],
num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs],
seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs] if self.seq_lens_cpu is not None else None,
num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs]
if self.num_computed_tokens_cpu is not None
else None,
num_reqs=num_actual_reqs,
num_actual_tokens=num_actual_tokens,
max_query_len=self.max_query_len,
Expand Down
14 changes: 14 additions & 0 deletions vllm_ascend/batch_invariant.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,20 @@
torch_sum = torch.sum


def vllm_is_batch_invariant() -> bool:
    """Return ``True`` when batch-invariant mode is enabled.

    Compatibility wrapper for the helper that was removed from upstream
    vLLM during a recent refactor: prefer the value exposed on the
    ``envs`` module when present, otherwise read the raw
    ``VLLM_BATCH_INVARIANT`` environment variable directly.
    """
    if not hasattr(envs, "VLLM_BATCH_INVARIANT"):
        # Older vLLM builds don't expose the flag on envs; parse the
        # environment variable the same way upstream does ("0"/"1").
        return bool(int(os.getenv("VLLM_BATCH_INVARIANT", "0")))
    return bool(envs.VLLM_BATCH_INVARIANT)


if HAS_TRITON:
from vllm_ascend.ops.triton.batch_invariant.matmul import (
addmm_batch_invariant,
Expand Down
Loading
Loading