From a5bfa27d214bb0033b566b76ac457d751e952da9 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Wed, 25 Mar 2026 08:55:12 +0000 Subject: [PATCH 01/31] [Async][spec decode] Zero-bubble async scheduling +spec decoding Signed-off-by: 01267596 --- vllm_ascend/attention/attention_v1.py | 2 +- vllm_ascend/attention/utils.py | 2 +- vllm_ascend/spec_decode/eagle_proposer.py | 22 ++- vllm_ascend/worker/block_table.py | 109 ++++-------- vllm_ascend/worker/model_runner_v1.py | 200 +++++++++++++++++----- vllm_ascend/worker/npu_input_batch.py | 2 +- 6 files changed, 202 insertions(+), 135 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 6443c5216e2..5a7b8d3a329 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -278,7 +278,7 @@ def build( ) block_table = common_attn_metadata.block_table_tensor - seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs] + seq_lens = common_attn_metadata.seq_lens[:num_reqs] slot_mapping = common_attn_metadata.slot_mapping[:num_actual_tokens] # this slot_mapping override doesn't work since vllm will override it again. We should fix it vllm. diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py index 946d5c66d4e..c5513745b0c 100644 --- a/vllm_ascend/attention/utils.py +++ b/vllm_ascend/attention/utils.py @@ -169,7 +169,7 @@ def unpadded(self, num_actual_tokens: int, num_actual_reqs: int) -> "AscendCommo query_start_loc=self.query_start_loc[: num_actual_reqs + 1], query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1], seq_lens=self.seq_lens[:num_actual_reqs], - seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs], + seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs] if self.seq_lens_cpu is not None else None, num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs], num_reqs=num_actual_reqs, num_actual_tokens=num_actual_tokens, diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index e47f4590908..923ed5d221d 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -90,7 +90,7 @@ class SpecDecodeBaseProposer(EagleProposer): def __init__(self, vllm_config: VllmConfig, device: torch.device, pass_hidden_states_to_model: bool, runner=None): super().__init__(vllm_config, device, runner) - + self.runner = runner self.use_async_scheduling = self.vllm_config.scheduler_config.async_scheduling self.pass_hidden_states_to_model = pass_hidden_states_to_model self.decode_threshold = 1 + self.num_speculative_tokens @@ -370,7 +370,7 @@ def dummy_run( common_attn_metadata = AscendCommonAttentionMetadata( query_start_loc=self.query_start_loc.gpu[: num_reqs + 1], query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs + 1], - seq_lens_cpu=self.runner.seq_lens.cpu, + seq_lens_cpu=self.runner.optimistic_seq_lens_cpu, seq_lens=self.runner.seq_lens.gpu[:num_reqs], num_reqs=num_reqs, num_actual_tokens=num_tokens, @@ -544,7 +544,7 @@ def _propose( common_attn_metadata.block_table_tensor, num_reqs_padded ) common_attn_metadata.seq_lens = self.runner.seq_lens.gpu[:num_reqs_padded] - common_attn_metadata.seq_lens_cpu = self.runner.seq_lens.cpu[:num_reqs_padded] + common_attn_metadata.seq_lens_cpu = self.runner.optimistic_seq_lens_cpu[:num_reqs_padded] if self.supports_mm_inputs: mm_embeds, is_mm_embed = mm_embed_inputs or (None, None) @@ -1190,10 +1190,10 @@ def attn_update_stack_num_spec_norm( # For the requests that exceed the max model length, we set the # 
sequence length to 1 to minimize their overheads in attention. common_attn_metadata.seq_lens[:batch_size].masked_fill_(exceeds_max_model_len, 1) - - common_attn_metadata.seq_lens_cpu[:batch_size] = common_attn_metadata.seq_lens_cpu[:batch_size] + 1 - exceeds_mask = common_attn_metadata.seq_lens_cpu[:batch_size] >= self.max_model_len - common_attn_metadata.seq_lens_cpu[:batch_size].masked_fill_(exceeds_mask, 1) + if common_attn_metadata.seq_lens_cpu is not None: + common_attn_metadata.seq_lens_cpu[:batch_size] = common_attn_metadata.seq_lens_cpu[:batch_size] + 1 + exceeds_mask = common_attn_metadata.seq_lens_cpu[:batch_size] >= self.max_model_len + common_attn_metadata.seq_lens_cpu[:batch_size].masked_fill_(exceeds_mask, 1) common_attn_metadata.num_computed_tokens_cpu[:batch_size] += 1 if self.uses_mrope: common_attn_metadata.positions[:batch_size].copy_(clamped_positions[0]) @@ -1258,7 +1258,7 @@ def attn_update_stack_num_spec_norm( def prepare_next_token_ids_padded( self, - common_attn_metadata: CommonAttentionMetadata, + seq_lens_cpu: torch.Tensor, sampled_token_ids: torch.Tensor, requests: dict[str, CachedRequestState], gpu_input_batch: InputBatch, @@ -1278,11 +1278,9 @@ def prepare_next_token_ids_padded( # Precompute get_token_id for when there is no valid next token num_reqs = gpu_input_batch.num_reqs + seq_lens_list = seq_lens_cpu[:num_reqs].tolist() self.backup_next_token_ids.np[:num_reqs] = np.array( - [ - requests[gpu_input_batch.req_ids[i]].get_token_id(common_attn_metadata.seq_lens_cpu[i].item()) - for i in range(num_reqs) - ] + [requests[gpu_input_batch.req_ids[i]].get_token_id(seq_lens_list[i]) for i in range(num_reqs)] ) self.backup_next_token_ids.copy_to_gpu(num_reqs) diff --git a/vllm_ascend/worker/block_table.py b/vllm_ascend/worker/block_table.py index 3c812aa4432..f4bf6d4e344 100644 --- a/vllm_ascend/worker/block_table.py +++ b/vllm_ascend/worker/block_table.py @@ -2,7 +2,9 @@ import torch from vllm.distributed import get_dcp_group, get_pcp_group from vllm.utils.math_utils import cdiv +from vllm.v1.attention.backends.utils import PAD_SLOT_ID from vllm.v1.utils import CpuGpuBuffer +from vllm.v1.worker.block_table import _compute_slot_mapping_kernel from vllm.v1.worker.cp_utils import get_total_cp_world_size @@ -117,80 +119,34 @@ def swap_row(self, src: int, tgt: int) -> None: self.block_table.np[[src, tgt]] = self.block_table.np[[tgt, src]] - def compute_slot_mapping(self, req_indices: np.ndarray, positions: np.ndarray) -> None: - # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] - # where K is the max_num_blocks_per_req and the block size is 2. - # NOTE(woosuk): We can't simply use `token_indices // block_size` - # here because M (max_model_len) is not necessarily divisible by - # block_size. - - if self.dcp_world_size * self.pcp_world_size > 1: - # Note(hc): The DCP implement store kvcache with an interleave - # style, the kvcache for the token whose token_idx is i is - # always stored on the GPU whose dcp_rank equals i % pcp_world_size: - - # Use a "virtual block" which equals to world_size * block_size - # for block_table_indices calculation. 
- virtual_block_size = self.block_size * self.dcp_world_size * self.pcp_world_size - - # IMPORTANT: In hybrid mode, positions are in logical block space, - # but we need to map them to the correct logical block table indices - logical_block_idx = positions // virtual_block_size - - # Account for the expanded logical table - # (always needed with unified tensor) - # Each physical block is split into multiple logical blocks - # The logical table has been expanded to accommodate this - block_table_indices = ( - req_indices * self.max_num_blocks_per_req * self.blocks_per_phys_block + logical_block_idx - ) - - block_numbers = self.block_table.np.ravel()[block_table_indices] - # Use virtual_block_size for mask calculation, which marks local - # tokens. - virtual_block_offsets = positions % virtual_block_size - self.current_rank = self.dcp_world_size * self.pcp_rank + self.dcp_rank - mask = ( - virtual_block_offsets // self.cp_kv_cache_interleave_size % (self.dcp_world_size * self.pcp_world_size) - == self.current_rank - ) - # Calculate local block_offsets - block_offsets = ( - virtual_block_offsets - // (self.dcp_world_size * self.pcp_world_size * self.cp_kv_cache_interleave_size) - * self.cp_kv_cache_interleave_size - + virtual_block_offsets % self.cp_kv_cache_interleave_size - ) - # Calculate slot_mapping - slot_mapping = block_numbers * self.block_size + block_offsets - # Write final slots, use -1 for not-local - self.slot_mapping.np[: req_indices.shape[0]] = np.where(mask, slot_mapping, -1) - else: - assert self.kernel_sizes is not None - if self.block_size == self.kernel_sizes[0]: - # IMPORTANT: In hybrid mode, positions are in logical block space, - # but we need to map them to the correct logical block table indices - logical_block_idx = positions // self.block_size - - # Account for the expanded logical table - # (always needed with unified tensor) - # Each physical block is split into multiple logical blocks - # The logical table has been expanded to accommodate this - block_table_indices = ( - req_indices * self.max_num_blocks_per_req * self.blocks_per_phys_block + logical_block_idx - ) - - block_numbers = self.block_table.np.ravel()[block_table_indices] - block_offsets = positions % self.block_size - np.add(block_numbers * self.block_size, block_offsets, out=self.slot_mapping.np[: req_indices.shape[0]]) + def compute_slot_mapping( + self, + num_reqs: int, + query_start_loc: torch.Tensor, + positions: torch.Tensor, + ) -> None: + num_tokens = positions.shape[0] + total_cp_world_size = self.pcp_world_size * self.dcp_world_size + total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank + _compute_slot_mapping_kernel[(num_reqs + 1,)]( + num_tokens, + self.max_num_batched_tokens, + query_start_loc, + positions, + self.block_table.gpu, + self.block_table.gpu.stride(0), + self.block_size, + self.slot_mapping.gpu, + TOTAL_CP_WORLD_SIZE=total_cp_world_size, + TOTAL_CP_RANK=total_cp_rank, + CP_KV_CACHE_INTERLEAVE_SIZE=self.cp_kv_cache_interleave_size, + PAD_ID=PAD_SLOT_ID, + BLOCK_SIZE=1024, + ) def commit_block_table(self, num_reqs: int) -> None: self.block_table.copy_to_gpu(num_reqs) - def commit_slot_mapping(self, num_tokens: int) -> None: - self.slot_mapping.copy_to_gpu(num_tokens) - def clear(self) -> None: self.block_table.fill_(0) self.block_table.cpu.fill_(0) @@ -299,18 +255,19 @@ def swap_row(self, src: int, tgt: int) -> None: for block_table in self.block_tables: block_table.swap_row(src, tgt) - def compute_slot_mapping(self, req_indices: np.ndarray, positions: np.ndarray) 
-> None: + def compute_slot_mapping( + self, + num_reqs: int, + query_start_loc: torch.Tensor, + positions: torch.Tensor, + ) -> None: for block_table in self.block_tables: - block_table.compute_slot_mapping(req_indices, positions) + block_table.compute_slot_mapping(num_reqs, query_start_loc, positions) def commit_block_table(self, num_reqs: int) -> None: for block_table in self.block_tables: block_table.commit_block_table(num_reqs) - def commit_slot_mapping(self, num_tokens: int) -> None: - for block_table in self.block_tables: - block_table.commit_slot_mapping(num_tokens) - def clear(self) -> None: for block_table in self.block_tables: block_table.clear() diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index bac0cba61ec..0b08e471d37 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -76,6 +76,7 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.spec_decode.metadata import SpecDecodeMetadata +from vllm.v1.spec_decode.utils import update_num_computed_tokens_for_batch_change from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import record_function_or_nullcontext from vllm.v1.worker import mamba_utils @@ -637,13 +638,13 @@ def _prepare_inputs( self.with_prefill = with_prefill # Get positions. - positions_np = self.positions.np[:total_num_scheduled_tokens] - cu_num_tokens, arange = self._get_cumsum_and_arange(num_scheduled_tokens) - np.add(self.input_batch.num_computed_tokens_cpu[req_indices], arange, out=positions_np) - - self.input_batch.block_table.compute_slot_mapping(req_indices, positions_np) - self.input_batch.block_table.commit_slot_mapping(total_num_scheduled_tokens) - + cu_num_tokens = self._get_cumsum_and_arange( + num_scheduled_tokens, self.query_pos.np + ) + positions_np = ( + self.input_batch.num_computed_tokens_cpu[req_indices] + + self.query_pos.np[: cu_num_tokens[-1]] + ) if self.use_cp: self.pcp_manager.init_batch_info( num_scheduled_tokens, @@ -760,15 +761,28 @@ def _prepare_inputs( self.gdn_query_start_loc.np[num_reqs + 1 :].fill(cu_num_tokens[-1]) self.gdn_query_start_loc.copy_to_gpu() - self.seq_lens.np[:num_reqs] = self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens - self.seq_lens.cpu[num_reqs:].fill_(0) - self.seq_lens.copy_to_gpu() + + # Compute optimistic seq_lens (assumes all draft tokens from previous + # iteration accepted). Store in optimistic_seq_lens_cpu for use by + # _build_attention_metadata (max_seq_len) and discard_request_mask. + # seq_lens (GPU) will be computed later using the same optimistic values. + torch.add( + self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs], + torch.from_numpy(num_scheduled_tokens), + out=self.optimistic_seq_lens_cpu[:num_reqs], + ) + self.optimistic_seq_lens_cpu[num_reqs:].fill_(0) + + # Build prev_positions mapping: current pos -> prev pos (-1 if new). + # Used for gathering from previous iteration's GPU tensors. + prev_req_id_to_index = self.input_batch.prev_req_id_to_index + self._compute_prev_positions(num_reqs) # Fill unused with -1. Needed for reshape_and_cache in attention_cp self.query_start_loc.gpu[num_reqs + 1 :].fill_(-1) # Copy the tensors to the NPU. - self._prepare_input_ids(scheduler_output, total_num_scheduled_tokens, cu_num_tokens) + self._prepare_input_ids(scheduler_output, num_reqs, total_num_scheduled_tokens, cu_num_tokens) # Calculate M-RoPE positions. 
# Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: @@ -785,9 +799,6 @@ def _prepare_inputs( self.xdrope_positions.cpu[:, :total_num_scheduled_tokens], non_blocking=True, ) - else: - # Common case (1D positions) - self.positions.copy_to_gpu(total_num_scheduled_tokens) # Record the index of requests that should not be sampled, # so that we could clear the sampled tokens before returning @@ -805,12 +816,86 @@ def _prepare_inputs( ) discard_requests_mask = original_seq_lens_np < num_tokens_np else: - discard_requests_mask = self.seq_lens.np[:num_reqs] < num_tokens_np + discard_requests_mask = self.optimistic_seq_lens_cpu[:num_reqs].numpy() < num_tokens_np discard_request_indices = np.nonzero(discard_requests_mask)[0] self.num_discarded_requests = len(discard_request_indices) self.discard_request_indices.np[: self.num_discarded_requests] = discard_request_indices self.discard_request_indices.copy_to_gpu(self.num_discarded_requests) + + # Sync num_accepted_tokens from CPU (set by + # _update_states_after_model_execute for hybrid models). + if self.num_accepted_tokens_event is not None: + self.num_accepted_tokens_event.synchronize() + self.num_accepted_tokens.np[:num_reqs] = ( + self.input_batch.num_accepted_tokens_cpu[:num_reqs] + ) + self.num_accepted_tokens.np[num_reqs:].fill(1) + self.num_accepted_tokens.copy_to_gpu() + else: + self.num_accepted_tokens.np.fill(1) + self.num_accepted_tokens.gpu.fill_(1) + + # Update num_computed_tokens on GPU. In async spec decode, + # CPU values are optimistic (all drafts accepted). The kernel + # corrects on GPU using the previous step's + # valid_sampled_token_count_gpu. Otherwise, just copy from CPU. + if ( + self.use_async_spec_decode + and self.valid_sampled_token_count_gpu is not None + and prev_req_id_to_index + ): + self.prev_positions.copy_to_gpu(num_reqs) + self.prev_num_draft_tokens.copy_to_gpu() + cpu_values = self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs].to( + device=self.device, non_blocking=True + ) + update_num_computed_tokens_for_batch_change( + self.num_computed_tokens, + self.num_accepted_tokens.gpu[:num_reqs], + self.prev_positions.gpu[:num_reqs], + self.valid_sampled_token_count_gpu, + self.prev_num_draft_tokens.gpu, + cpu_values, + ) + else: + self.num_computed_tokens[:num_reqs].copy_( + self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs], + non_blocking=True, + ) + + self.req_indices.np[:total_num_scheduled_tokens] = req_indices + self.req_indices.copy_to_gpu(total_num_scheduled_tokens) + req_indices_gpu = self.req_indices.gpu[:total_num_scheduled_tokens] + + self.query_pos.copy_to_gpu(total_num_scheduled_tokens) + self.num_scheduled_tokens.np[:num_reqs] = num_scheduled_tokens + self.num_scheduled_tokens.copy_to_gpu(num_reqs) + num_scheduled_tokens_gpu = self.num_scheduled_tokens.gpu[:num_reqs] + self.positions[:total_num_scheduled_tokens] = ( + self.num_computed_tokens[req_indices_gpu].to(torch.int64) + + self.query_pos.gpu[:total_num_scheduled_tokens] + ) + self.seq_lens[:num_reqs] = ( + self.num_computed_tokens[:num_reqs] + num_scheduled_tokens_gpu + ) + self.seq_lens[num_reqs:].fill_(0) + + self.input_batch.block_table.compute_slot_mapping( + num_reqs, + self.query_start_loc.gpu[: num_reqs + 1], + self.positions[:total_num_scheduled_tokens], + ) + + if self.use_async_spec_decode and (self.uses_mrope or self.uses_xdrope_dim > 0): + drift = self.num_computed_tokens[req_indices_gpu].to( + torch.int64 + ) - self.input_batch.num_computed_tokens_cpu_tensor[req_indices].to( + device=self.device, 
dtype=torch.int64, non_blocking=True + ) + target = self.mrope_positions if self.uses_mrope else self.xdrope_positions + target.gpu[:, :total_num_scheduled_tokens] += drift + use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0 if not use_spec_decode: # NOTE(woosuk): Due to chunked prefills, the batch may contain @@ -840,11 +925,12 @@ def _prepare_inputs( draft_token_ids, ) in scheduler_output.scheduled_spec_decode_tokens.items(): req_idx = self.input_batch.req_id_to_index[req_id] - num_draft_tokens[req_idx] = len(draft_token_ids) + draft_len = len(draft_token_ids) + num_draft_tokens[req_idx] = draft_len if (self.is_kv_consumer and req_id in new_schedule_reqs) or \ (self.input_batch.num_computed_tokens_cpu[req_idx] >= \ self.input_batch.num_prompt_tokens[req_idx]): - num_decode_draft_tokens[req_idx] = len(draft_token_ids) + num_decode_draft_tokens[req_idx] = draft_len else: num_decode_draft_tokens[req_idx] = -1 @@ -927,24 +1013,23 @@ def _calc_spec_decode_metadata( # Compute the logits indices. # [4, 1, 3, 1, 2] num_sampled_tokens = num_draft_tokens + 1 - # Step 1. [4, 5, 8, 9, 11] - cu_num_sampled_tokens = np.cumsum(num_sampled_tokens, dtype=np.int32) - total_num_sampled_tokens = cu_num_sampled_tokens[-1] - # Step 2. [0, 0, 0, 0, 4, 5, 5, 5, 8, 9, 9] - cumsums_offsets = np.repeat(cu_num_sampled_tokens - num_sampled_tokens, num_sampled_tokens) - # Step 3. [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1] - arange = self.arange_np[:total_num_sampled_tokens] - cumsums_offsets - # Step 4. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207] + # Step 1. + # cu_num_sampled_tokens: [4, 5, 8, 9, 11] + # _arange_scratch[:11]: [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1] + cu_num_sampled_tokens = self._get_cumsum_and_arange( + num_sampled_tokens, self._arange_scratch, cumsum_dtype=np.int32 + ) + # Step 2. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207] logits_indices = np.repeat(cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens) - # Step 5. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208] - logits_indices += arange + # Step 3. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208] + logits_indices += self._arange_scratch[: cu_num_sampled_tokens[-1]] # while pcp > 1, decode results may contain padding (from pcp all-gather), # update logits_indices after getting draft_token_ids from ori logits_indices if self.pcp_size > 1: cu_num_scheduled_tokens = cu_num_scheduled_tokens * self.pcp_size - num_pcp_pads logits_indices_pcp = np.repeat(cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens) - logits_indices_pcp += arange + logits_indices_pcp += self._arange_scratch[: cu_num_sampled_tokens[-1]] logits_indices_pcp = torch.from_numpy(logits_indices_pcp).pin_memory().to(self.device, non_blocking=True) # Compute the bonus logits indices. @@ -1035,7 +1120,7 @@ def propose_draft_token_ids( ) assert self.drafter is not None next_token_ids, valid_sampled_tokens_count = self.drafter.prepare_next_token_ids_padded( - common_attn_metadata, + self.optimistic_seq_lens_cpu, sampled_token_ids, self.requests, self.input_batch, @@ -1164,7 +1249,7 @@ def execute_model( with record_function_or_nullcontext("prepare input"): with self.synchronize_input_prep(): # Update persistent batch states. - self._update_states(scheduler_output) + deferred_state_corrections_fn = self._update_states(scheduler_output) if has_ec_transfer() and get_ec_transfer().is_producer: with self.maybe_get_ec_connector_output( @@ -1267,6 +1352,12 @@ def execute_model( # '_update_states_after_model_execute', which is not overridden in vLLM-Ascend. 
# We simply utilize the implementation in vLLM. if self.cache_config.mamba_cache_mode == "align": + # preprocess_mamba reads req_state.num_computed_tokens (CPU) + # to decide copy operations, so we must apply deferred + # corrections before it runs. + if deferred_state_corrections_fn: + deferred_state_corrections_fn() + deferred_state_corrections_fn = None mamba_utils.preprocess_mamba( scheduler_output, self.kv_cache_config, @@ -1278,6 +1369,14 @@ def execute_model( self.model.get_mamba_state_copy_func(), self._get_mamba_copy_bufs(), ) + # preprocess_mamba resets num_accepted_tokens_cpu to 1 + # for requests whose state was copied to a new block. + # Re-sync to GPU so the mamba kernel reads from the + # correct initial state slot (init_token_idx = 0). + self.num_accepted_tokens.np[:num_reqs] = ( + self.input_batch.num_accepted_tokens_cpu[:num_reqs] + ) + self.num_accepted_tokens.copy_to_gpu(num_reqs) use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0 ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices @@ -1465,6 +1564,11 @@ def execute_model( batch_desc, ) self.kv_connector_output = kv_connector_output + + # Now the batch has been launched we can wait for corrections from the + # previous model forward without breaking async scheduling. + if deferred_state_corrections_fn: + deferred_state_corrections_fn() return None @torch.inference_mode() @@ -1529,6 +1633,8 @@ def sample_tokens( assert self.sampling_done_event is not None self.sampling_done_event.record() + self.valid_sampled_token_count_gpu = None + def propose_draft_token_ids(sampled_token_ids): assert spec_decode_common_attn_metadata is not None self._draft_token_ids = self.propose_draft_token_ids( @@ -2042,11 +2148,8 @@ def _build_attention_metadata( # window size when capturing to make sure the correct kernel is selected. max_seq_len = self.max_model_len else: - max_seq_len = self.seq_lens.np[:num_reqs].max().item() - if use_spec_decode and self.need_accepted_tokens: - self.num_accepted_tokens.np[:num_reqs] = self.input_batch.num_accepted_tokens_cpu[:num_reqs] - self.num_accepted_tokens.np[num_reqs:].fill(1) - self.num_accepted_tokens.copy_to_gpu() + max_seq_len = self.optimistic_seq_lens_cpu.numpy()[:num_reqs].max().item() + kv_cache_groups = self.kv_cache_config.kv_cache_groups @@ -2111,14 +2214,21 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int): block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0) self.long_seq_metadata, block_table_gid_0 = _get_pcp_metadata(block_table_gid_0) + seq_lens_cpu = self.optimistic_seq_lens_cpu[:num_reqs_padded] + if self.use_async_spec_decode: + # GPU tensors are authoritative in async mode. 
+ seq_lens_cpu = None + num_computed_tokens_cpu = None + cm_base = AscendCommonAttentionMetadata( query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1], query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1], - seq_lens=self.seq_lens.gpu[:num_reqs_padded], + seq_lens=self.seq_lens[:num_reqs_padded], # TODO - seq_lens_cpu=self.seq_lens.cpu[:num_reqs_padded], + seq_lens_cpu=seq_lens_cpu, # TODO - num_computed_tokens_cpu=self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs_padded], + # num_computed_tokens_cpu=self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs_padded], + num_computed_tokens_cpu=num_computed_tokens_cpu, num_reqs=num_reqs_padded, num_actual_tokens=num_tokens, max_query_len=max_query_len, @@ -2128,7 +2238,7 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int): causal=True, num_input_tokens=num_tokens_padded, actual_seq_lengths_q=self.actual_seq_lengths_q, - positions=self.positions.gpu, + positions=self.positions, attn_state=self.attn_state, decode_token_per_req=self.decode_token_per_req, prefill_context_parallel_metadata=self.long_seq_metadata, @@ -2392,11 +2502,13 @@ def _dummy_run( if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config) else max_query_len ) # type: ignore[assignment] - self.seq_lens.np[:num_reqs_padded] = seq_lens - self.seq_lens.np[num_reqs_padded:] = 0 - self.seq_lens.copy_to_gpu() - cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens) + self.optimistic_seq_lens_cpu[:num_reqs] = seq_lens + self.optimistic_seq_lens_cpu[num_reqs:].fill_(0) + self.seq_lens.copy_(self.optimistic_seq_lens_cpu, non_blocking=True) + + cum_num_tokens = self._get_cumsum_and_arange( + num_scheduled_tokens, self.query_pos.np) self.query_start_loc.np[1 : num_reqs_padded + 1] = cum_num_tokens self.query_start_loc.copy_to_gpu() num_reqs_padded = self._pad_query_start_loc_for_fia( @@ -2438,7 +2550,7 @@ def _dummy_run( elif self.uses_xdrope_dim > 0: positions = self.xdrope_positions.gpu[:, :num_tokens_padded] else: - positions = self.positions.gpu[:num_tokens_padded] + positions = self.positions[:num_tokens_padded] # update global cos, sin update_cos_sin(positions) diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py index a48ea5efbb6..98dfafa92b5 100644 --- a/vllm_ascend/worker/npu_input_batch.py +++ b/vllm_ascend/worker/npu_input_batch.py @@ -164,7 +164,7 @@ def __init__( # Speculative decoding self.num_accepted_tokens_cpu_tensor = torch.ones( - (max_num_reqs,), dtype=torch.int64, device="cpu", pin_memory=pin_memory + (max_num_reqs,), dtype=torch.int32, device="cpu", pin_memory=pin_memory ) self.num_accepted_tokens_cpu = self.num_accepted_tokens_cpu_tensor.numpy() From e599872ea4865e777c4a792e58b21f3454363aa5 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Wed, 25 Mar 2026 09:22:32 +0000 Subject: [PATCH 02/31] [Async][spec decode] Zero-bubble async scheduling +spec decoding Signed-off-by: 01267596 --- vllm_ascend/attention/utils.py | 2 +- vllm_ascend/spec_decode/eagle_proposer.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py index c5513745b0c..1c4245ad654 100644 --- a/vllm_ascend/attention/utils.py +++ b/vllm_ascend/attention/utils.py @@ -170,7 +170,7 @@ def unpadded(self, num_actual_tokens: int, num_actual_reqs: int) -> "AscendCommo query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1], seq_lens=self.seq_lens[:num_actual_reqs], 
seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs] if self.seq_lens_cpu is not None else None, - num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs], + num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs] if self.num_computed_tokens_cpu is not None else None, num_reqs=num_actual_reqs, num_actual_tokens=num_actual_tokens, max_query_len=self.max_query_len, diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 923ed5d221d..844bd00148f 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -1194,7 +1194,8 @@ def attn_update_stack_num_spec_norm( common_attn_metadata.seq_lens_cpu[:batch_size] = common_attn_metadata.seq_lens_cpu[:batch_size] + 1 exceeds_mask = common_attn_metadata.seq_lens_cpu[:batch_size] >= self.max_model_len common_attn_metadata.seq_lens_cpu[:batch_size].masked_fill_(exceeds_mask, 1) - common_attn_metadata.num_computed_tokens_cpu[:batch_size] += 1 + if common_attn_metadata.num_computed_tokens_cpu is not None: + common_attn_metadata.num_computed_tokens_cpu[:batch_size] += 1 if self.uses_mrope: common_attn_metadata.positions[:batch_size].copy_(clamped_positions[0]) else: From 33a6d13a182d09b12b424c904521db5f9b865396 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Wed, 25 Mar 2026 10:20:48 +0000 Subject: [PATCH 03/31] [Async][spec decode] Zero-bubble async scheduling +spec decoding Signed-off-by: 01267596 --- vllm_ascend/worker/model_runner_v1.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 0b08e471d37..502544c6863 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2213,7 +2213,9 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int): block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0) self.long_seq_metadata, block_table_gid_0 = _get_pcp_metadata(block_table_gid_0) - + num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[ + :num_reqs_padded + ] seq_lens_cpu = self.optimistic_seq_lens_cpu[:num_reqs_padded] if self.use_async_spec_decode: # GPU tensors are authoritative in async mode. From 0a60c267a4b2ccb8e7591d04695955562926cd04 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Fri, 27 Mar 2026 02:47:31 +0000 Subject: [PATCH 04/31] optimize Signed-off-by: 01267596 --- vllm_ascend/spec_decode/utils.py | 35 +++++++++++++++++++++++++++ vllm_ascend/worker/model_runner_v1.py | 2 +- 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 vllm_ascend/spec_decode/utils.py diff --git a/vllm_ascend/spec_decode/utils.py b/vllm_ascend/spec_decode/utils.py new file mode 100644 index 00000000000..1d8d82fdcce --- /dev/null +++ b/vllm_ascend/spec_decode/utils.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch + + +def update_num_computed_tokens_for_batch_change( + num_computed_tokens: torch.Tensor, + num_accepted_tokens: torch.Tensor, + prev_positions: torch.Tensor, + valid_sampled_token_count: torch.Tensor, + prev_num_draft_tokens: torch.Tensor, + cpu_num_computed_tokens: torch.Tensor, +) -> None: + """Correct num_computed_tokens for async spec decode drift. + + Requests that had drafts: corrected = prev_gpu + valid_count. + New requests or non-draft (e.g. prefills): use CPU value directly. 
+ """ + # Clamp because prev_positions can be -1 for new requests + gather_indices = prev_positions.clamp(min=0) + + valid_counts = valid_sampled_token_count[gather_indices] + prev_computed = num_computed_tokens[gather_indices] + prev_drafts = prev_num_draft_tokens[gather_indices] + + participating = (prev_positions >= 0) & (prev_drafts > 0) + corrected = prev_computed + valid_counts.int() + + n = prev_positions.shape[0] + num_computed_tokens[:n].copy_( + torch.where(participating, corrected, cpu_num_computed_tokens) + ) + num_accepted_tokens.copy_( + torch.where(participating, valid_counts, num_accepted_tokens) + ) \ No newline at end of file diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 502544c6863..98bf538f70a 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -76,7 +76,6 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.spec_decode.metadata import SpecDecodeMetadata -from vllm.v1.spec_decode.utils import update_num_computed_tokens_for_batch_change from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import record_function_or_nullcontext from vllm.v1.worker import mamba_utils @@ -119,6 +118,7 @@ from vllm_ascend.spec_decode.medusa_proposer import AscendMedusaProposer from vllm_ascend.spec_decode.ngram_proposer import AscendNgramProposer from vllm_ascend.spec_decode.suffix_proposer import AscendSuffixDecodingProposer +from vllm_ascend.spec_decode.utils import update_num_computed_tokens_for_batch_change from vllm_ascend.utils import ( calc_split_factor, check_gdn_layer, From 92481840940ee0f5791537d18c621ed027a46583 Mon Sep 17 00:00:00 2001 From: 22dimensions Date: Wed, 25 Mar 2026 11:27:11 +0800 Subject: [PATCH 05/31] update to 0324 Signed-off-by: 01267596 --- .github/workflows/_e2e_test.yaml | 2 +- .github/workflows/bot_pr_create.yaml | 2 +- .github/workflows/dockerfiles/Dockerfile.lint | 2 +- .github/workflows/pr_test_full.yaml | 2 +- .github/workflows/pr_test_light.yaml | 8 ++++++-- .github/workflows/schedule_codecov_refresh.yaml | 2 +- 6 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index a5a0a63aaec..bbad7b5aa3a 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -18,7 +18,7 @@ on: continue_on_error: required: false type: boolean - default: false + default: true env: UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index 7e24b5d396e..bdb75d25ae0 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029 + VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" - name: Checkout repository diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index 9544bba796f..948d88fdf44 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # 
For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029 +ARG VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \ git checkout $VLLM_COMMIT diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 5e002c65fbe..d9e8b62e7c1 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0] + vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index aa508c938da..af328a54977 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,11 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: +<<<<<<< HEAD vllm: 35141a7eeda941a60ad5a4956670c60fd5a77029 +======= + vllm: 14acf429ac08b6d538ca6feb3e06b6d13895804d +>>>>>>> a8b92f94 (update to 0324) changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -90,7 +94,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0] + vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -102,7 +106,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0] + vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index 2eecbb1cc2e..1b8e6d3837b 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029] + vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} From 1dfa935fdede5609cc3d1e6c1266135df90fa89b Mon Sep 17 00:00:00 2001 From: Claude Code Date: Wed, 25 Mar 2026 06:42:30 +0000 Subject: [PATCH 06/31] fix: add vllm_is_batch_invariant compatibility wrapper Upstream vLLM has removed the vllm_is_batch_invariant() function from batch_invariant.py and now uses envs.VLLM_BATCH_INVARIANT directly. Create a compatibility wrapper in vllm_ascend/batch_invariant.py that checks envs.VLLM_BATCH_INVARIANT and update all imports across the codebase to use the local implementation instead of trying to import from vllm. 
Changes: - Add vllm_is_batch_invariant() function to vllm_ascend/batch_invariant.py - Update imports in ascend_config.py, sample/sampler.py, and utils.py Fixes: ImportError when running multicard tests Co-Authored-By: Claude Code Signed-off-by: 01267596 --- vllm_ascend/ascend_config.py | 2 +- vllm_ascend/batch_invariant.py | 16 +++++++++++++++- vllm_ascend/sample/sampler.py | 5 +---- vllm_ascend/utils.py | 2 +- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index c872c1796bb..cdb9d0465aa 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -129,7 +129,7 @@ def __init__(self, vllm_config: "VllmConfig"): # when enable_async_exponential is True, AscendSampler will be different from vllm Sampler, # which make batch_invariant mode not working. # so we disable async exponential when batch_invariant mode is enabled. - from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant + from vllm_ascend.batch_invariant import vllm_is_batch_invariant self.enable_async_exponential = ( bool(additional_config.get("enable_async_exponential", False)) and not vllm_is_batch_invariant() diff --git a/vllm_ascend/batch_invariant.py b/vllm_ascend/batch_invariant.py index 7f27fd6e4ef..20270068eb7 100644 --- a/vllm_ascend/batch_invariant.py +++ b/vllm_ascend/batch_invariant.py @@ -20,14 +20,28 @@ import torch import torch_npu +import vllm.envs as envs from vllm.logger import logger -from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant from vllm.triton_utils import HAS_TRITON # in case recursive call in reduce_sum. torch_sum = torch.sum +def vllm_is_batch_invariant() -> bool: + """Check if batch-invariant mode is enabled. + + This is a compatibility wrapper for the vllm function that was removed + in recent upstream vLLM refactoring. + """ + # Try to access from envs module, fall back to environment variable + if hasattr(envs, 'VLLM_BATCH_INVARIANT'): + return bool(envs.VLLM_BATCH_INVARIANT) + else: + # Fallback to environment variable for older vLLM versions + return bool(int(os.getenv("VLLM_BATCH_INVARIANT", "0"))) + + if HAS_TRITON: from vllm_ascend.ops.triton.batch_invariant.matmul import ( addmm_batch_invariant, diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index 68082152aed..5e35901112e 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -1,12 +1,9 @@ import torch -from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant -from vllm.triton_utils import HAS_TRITON -from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler from vllm.v1.sample.sampler import Sampler from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.sample.penalties import apply_all_penalties +from vllm_ascend.batch_invariant import vllm_is_batch_invariant from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type, global_stream, npu_stream_switch DEFAULT_LOGPROBS_MODE = "raw_logprobs" diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 7e773432f40..9eb164378ab 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -259,7 +259,7 @@ def enable_custom_op(): Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component. Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device(). 
""" - from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant + from vllm_ascend.batch_invariant import vllm_is_batch_invariant global _CUSTOM_OP_ENABLED From c82dad6afb1b0a2924b7436e24ed983b73b549ad Mon Sep 17 00:00:00 2001 From: 01267596 Date: Tue, 31 Mar 2026 03:09:24 +0000 Subject: [PATCH 07/31] fix Signed-off-by: 01267596 --- .github/workflows/pr_test_light.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index af328a54977..6368a0e44f9 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,11 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: -<<<<<<< HEAD - vllm: 35141a7eeda941a60ad5a4956670c60fd5a77029 -======= vllm: 14acf429ac08b6d538ca6feb3e06b6d13895804d ->>>>>>> a8b92f94 (update to 0324) changes: runs-on: linux-aarch64-a2b3-0 outputs: From 62c38eeea94cadf96d13d758a282e3477cfaa46d Mon Sep 17 00:00:00 2001 From: 01267596 Date: Tue, 31 Mar 2026 03:17:09 +0000 Subject: [PATCH 08/31] fix format Signed-off-by: 01267596 --- vllm_ascend/attention/utils.py | 4 +++- vllm_ascend/batch_invariant.py | 2 +- vllm_ascend/spec_decode/utils.py | 8 ++------ 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py index 1c4245ad654..5a89d4fa962 100644 --- a/vllm_ascend/attention/utils.py +++ b/vllm_ascend/attention/utils.py @@ -170,7 +170,9 @@ def unpadded(self, num_actual_tokens: int, num_actual_reqs: int) -> "AscendCommo query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1], seq_lens=self.seq_lens[:num_actual_reqs], seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs] if self.seq_lens_cpu is not None else None, - num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs] if self.num_computed_tokens_cpu is not None else None, + num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs] + if self.num_computed_tokens_cpu is not None + else None, num_reqs=num_actual_reqs, num_actual_tokens=num_actual_tokens, max_query_len=self.max_query_len, diff --git a/vllm_ascend/batch_invariant.py b/vllm_ascend/batch_invariant.py index 20270068eb7..b3cff584073 100644 --- a/vllm_ascend/batch_invariant.py +++ b/vllm_ascend/batch_invariant.py @@ -35,7 +35,7 @@ def vllm_is_batch_invariant() -> bool: in recent upstream vLLM refactoring. 
""" # Try to access from envs module, fall back to environment variable - if hasattr(envs, 'VLLM_BATCH_INVARIANT'): + if hasattr(envs, "VLLM_BATCH_INVARIANT"): return bool(envs.VLLM_BATCH_INVARIANT) else: # Fallback to environment variable for older vLLM versions diff --git a/vllm_ascend/spec_decode/utils.py b/vllm_ascend/spec_decode/utils.py index 1d8d82fdcce..7f407cc2782 100644 --- a/vllm_ascend/spec_decode/utils.py +++ b/vllm_ascend/spec_decode/utils.py @@ -27,9 +27,5 @@ def update_num_computed_tokens_for_batch_change( corrected = prev_computed + valid_counts.int() n = prev_positions.shape[0] - num_computed_tokens[:n].copy_( - torch.where(participating, corrected, cpu_num_computed_tokens) - ) - num_accepted_tokens.copy_( - torch.where(participating, valid_counts, num_accepted_tokens) - ) \ No newline at end of file + num_computed_tokens[:n].copy_(torch.where(participating, corrected, cpu_num_computed_tokens)) + num_accepted_tokens.copy_(torch.where(participating, valid_counts, num_accepted_tokens)) From dfe1b6d6a715a3bc1a98221db591340b36fe3ee2 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Tue, 31 Mar 2026 06:53:09 +0000 Subject: [PATCH 09/31] fix Signed-off-by: 01267596 --- vllm_ascend/sample/sampler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index 5e35901112e..817557e06ad 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -1,4 +1,6 @@ import torch +from vllm.triton_utils import HAS_TRITON +from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler from vllm.v1.sample.sampler import Sampler From 5f157ba37049e7750d2bd043fbcbedc47fa163a8 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Tue, 31 Mar 2026 07:02:40 +0000 Subject: [PATCH 10/31] fix Signed-off-by: 01267596 --- vllm_ascend/sample/sampler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index 817557e06ad..e950d88010c 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -5,6 +5,7 @@ from vllm.v1.sample.sampler import Sampler from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.sample.penalties import apply_all_penalties from vllm_ascend.batch_invariant import vllm_is_batch_invariant from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type, global_stream, npu_stream_switch From 9b0ff7314b2fcfe3fb7d54e9e84b304841c4f221 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Tue, 31 Mar 2026 07:10:48 +0000 Subject: [PATCH 11/31] fix Signed-off-by: 01267596 --- vllm_ascend/sample/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index e950d88010c..95d16139859 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -5,8 +5,8 @@ from vllm.v1.sample.sampler import Sampler from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.sample.penalties import apply_all_penalties from vllm_ascend.batch_invariant import vllm_is_batch_invariant +from vllm_ascend.sample.penalties import apply_all_penalties from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type, global_stream, npu_stream_switch DEFAULT_LOGPROBS_MODE = "raw_logprobs" From c39d214dd3669f9386c84e6686cd241d583a3d69 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Tue, 31 Mar 2026 08:20:55 +0000 Subject: [PATCH 12/31] fix Signed-off-by: 01267596 --- 
vllm_ascend/kv_offload/npu.py | 8 ++++---- vllm_ascend/worker/model_runner_v1.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/kv_offload/npu.py b/vllm_ascend/kv_offload/npu.py index bd68ed16b27..90816ce3abb 100644 --- a/vllm_ascend/kv_offload/npu.py +++ b/vllm_ascend/kv_offload/npu.py @@ -5,8 +5,7 @@ from vllm.v1.attention.backend import AttentionBackend # type: ignore from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager -from vllm.v1.kv_offload.backends.cpu import CPUBackend -from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager +from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.spec import OffloadingSpec from vllm.v1.kv_offload.worker.worker import OffloadingHandler @@ -36,8 +35,9 @@ def get_manager(self) -> OffloadingManager: assert len(self.gpu_block_size) == 1 gpu_block_size = self.gpu_block_size[0] offloaded_block_size = gpu_block_size * self.block_size_factor - self._manager = LRUOffloadingManager( - CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks), + self._manager = CPUOffloadingManager( + block_size=offloaded_block_size, + num_blocks=self.num_cpu_blocks, enable_events=enable_events, ) return self._manager diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 98bf538f70a..7b5baaf4929 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -24,7 +24,7 @@ from copy import copy, deepcopy from dataclasses import dataclass from multiprocessing import Manager -from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias +from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias, Optional import numpy as np import torch @@ -1633,7 +1633,7 @@ def sample_tokens( assert self.sampling_done_event is not None self.sampling_done_event.record() - self.valid_sampled_token_count_gpu = None + self.valid_sampled_token_count_gpu: Optional[torch.Tensor] = None def propose_draft_token_ids(sampled_token_ids): assert spec_decode_common_attn_metadata is not None From 013dcbe59724b8da4b42f1951805be3a6d2297e7 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Tue, 31 Mar 2026 08:29:19 +0000 Subject: [PATCH 13/31] fix Signed-off-by: 01267596 --- vllm_ascend/worker/model_runner_v1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 7b5baaf4929..fb36c0f4e07 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -24,7 +24,7 @@ from copy import copy, deepcopy from dataclasses import dataclass from multiprocessing import Manager -from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias, Optional +from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias import numpy as np import torch @@ -1633,7 +1633,7 @@ def sample_tokens( assert self.sampling_done_event is not None self.sampling_done_event.record() - self.valid_sampled_token_count_gpu: Optional[torch.Tensor] = None + self.valid_sampled_token_count_gpu: torch.Tensor | None = None def propose_draft_token_ids(sampled_token_ids): assert spec_decode_common_attn_metadata is not None From 69bf146e4c2a126daf34cd0e2362c871ecf61bfa Mon Sep 17 00:00:00 2001 From: HF-001 <1670186653@qq.com> Date: Tue, 31 Mar 2026 20:12:22 +0800 Subject: [PATCH 14/31] fix Signed-off-by: HF-001 
<1670186653@qq.com> --- vllm_ascend/spec_decode/eagle_proposer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index d6b46d1c5dc..c1ba5ec6bd7 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -559,7 +559,9 @@ def _propose( common_attn_metadata.block_table_tensor, num_reqs_padded ) common_attn_metadata.seq_lens = self._adjust_tensor(self.runner.seq_lens.gpu, num_reqs_padded) - common_attn_metadata.seq_lens_cpu = self._adjust_tensor(self.runner.optimistic_seq_lens_cpu, num_reqs_padded) + common_attn_metadata.seq_lens_cpu = self._adjust_tensor( + self.runner.optimistic_seq_lens_cpu, num_reqs_padded + ) common_attn_metadata.num_computed_tokens_cpu = self._adjust_tensor( common_attn_metadata.num_computed_tokens_cpu, num_reqs_padded ) From c45a06613666cb6833ed318e78efc57347032772 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Wed, 1 Apr 2026 02:28:40 +0000 Subject: [PATCH 15/31] fix ut test Signed-off-by: 01267596 --- tests/ut/worker/test_block_table.py | 83 ++++++++++++++--------------- 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/tests/ut/worker/test_block_table.py b/tests/ut/worker/test_block_table.py index ba378e9b1da..9218dbddea0 100644 --- a/tests/ut/worker/test_block_table.py +++ b/tests/ut/worker/test_block_table.py @@ -25,7 +25,7 @@ class TestBlockTableComputeSlotMapping(TestBase): """Test suite for BlockTable.compute_slot_mapping() method - + This test suite covers different configurations of DCP (Decode Context Parallelism), PCP (Prefill Context Parallelism), and cp_kv_cache_interleave_size to ensure correct slot_mapping calculation on different ranks. @@ -41,13 +41,13 @@ def setUp(self): self.device = torch.device("cpu") self.kernel_sizes = [128] - def create_block_table(self, dcp_world_size, dcp_rank, pcp_world_size, - pcp_rank, cp_kv_cache_interleave_size): + def create_block_table(self, dcp_world_size, dcp_rank, pcp_world_size, pcp_rank, cp_kv_cache_interleave_size): """Helper method to create BlockTable with mocked distributed groups""" - with patch('vllm_ascend.worker.block_table.get_dcp_group') as mock_get_dcp_group, \ - patch('vllm_ascend.worker.block_table.get_pcp_group') as mock_get_pcp_group: - + with ( + patch("vllm_ascend.worker.block_table.get_dcp_group") as mock_get_dcp_group, + patch("vllm_ascend.worker.block_table.get_pcp_group") as mock_get_pcp_group, + ): # Mock DCP group mock_dcp_group = MagicMock(spec=GroupCoordinator) mock_dcp_group.world_size = dcp_world_size @@ -71,7 +71,8 @@ def create_block_table(self, dcp_world_size, dcp_rank, pcp_world_size, device=self.device, kernel_sizes=self.kernel_sizes, cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, - num_speculative_tokens=0) + num_speculative_tokens=0, + ) return block_table @@ -79,15 +80,12 @@ def setup_block_table_data(self, block_table, num_reqs=2): """Helper method to populate block table with test data""" # Add block IDs for each request for i in range(num_reqs): - block_ids = list(range(i * 4, - (i + 1) * 4)) # [0,1,2,3], [4,5,6,7], etc. + block_ids = list(range(i * 4, (i + 1) * 4)) # [0,1,2,3], [4,5,6,7], etc. 
block_table.add_row(block_ids, i) - def _test_slot_mapping_for_ranks(self, dcp_world_size, pcp_world_size, - cp_kv_cache_interleave_size, - test_configs): + def _test_slot_mapping_for_ranks(self, dcp_world_size, pcp_world_size, cp_kv_cache_interleave_size, test_configs): """Helper method to test slot_mapping across multiple ranks - + Args: dcp_world_size: Number of DCP ranks pcp_world_size: Number of PCP ranks @@ -97,31 +95,33 @@ def _test_slot_mapping_for_ranks(self, dcp_world_size, pcp_world_size, for dcp_rank, pcp_rank, req_indices, positions, expected_result in test_configs: with self.subTest(dcp_rank=dcp_rank, pcp_rank=pcp_rank): block_table = self.create_block_table( - dcp_world_size, dcp_rank, pcp_world_size, pcp_rank, - cp_kv_cache_interleave_size) + dcp_world_size, dcp_rank, pcp_world_size, pcp_rank, cp_kv_cache_interleave_size + ) num_reqs = max(req_indices) + 1 if len(req_indices) > 0 else 1 self.setup_block_table_data(block_table, num_reqs=num_reqs) block_table.compute_slot_mapping(req_indices, positions) - actual_result = block_table.slot_mapping.np[:len(positions)] + actual_result = block_table.slot_mapping.np[: len(positions)] np.testing.assert_array_equal( - actual_result, expected_result, + actual_result, + expected_result, f"DCP={dcp_world_size}, PCP={pcp_world_size}, " f"interleave={cp_kv_cache_interleave_size}, " - f"dcp_rank={dcp_rank}, pcp_rank={pcp_rank}") + f"dcp_rank={dcp_rank}, pcp_rank={pcp_rank}", + ) def test_compute_slot_mapping_dcp1_pcp1_interleave1(self): """Test compute_slot_mapping with DCP=1, PCP=1, interleave_size=1 - + With no parallelism (DCP=1, PCP=1), all tokens are local to the single rank. - + Setup: - Block size: 16 - Request 0 has blocks: [0, 1, 2, 3] - Request 1 has blocks: [4, 5, 6, 7] - + Test positions for each request: - Request 0, position 0: block_id=0, offset=0 → slot = 0*128+0 = 0 - Request 0, position 1: block_id=0, offset=1 → slot = 0*128+1 = 1 @@ -137,14 +137,13 @@ def test_compute_slot_mapping_dcp1_pcp1_interleave1(self): (0, 0, req_indices, positions, expected_result), ] - self._test_slot_mapping_for_ranks(dcp_world_size=1, - pcp_world_size=1, - cp_kv_cache_interleave_size=1, - test_configs=test_configs) + self._test_slot_mapping_for_ranks( + dcp_world_size=1, pcp_world_size=1, cp_kv_cache_interleave_size=1, test_configs=test_configs + ) def test_compute_slot_mapping_dcp4_pcp2_interleave1(self): """Test compute_slot_mapping with DCP=4, PCP=2, interleave_size=1 - + With interleave_size=1, tokens are distributed round-robin across all 8 ranks: - Position 0 → Rank 0 - Position 1 → Rank 1 @@ -183,28 +182,25 @@ def test_compute_slot_mapping_dcp4_pcp2_interleave1(self): for pcp_rank in range(2): for dcp_rank in range(4): current_rank = 4 * pcp_rank + dcp_rank - expected_result = np.array(rank_expectations[current_rank], - dtype=np.int32) - test_configs.append((dcp_rank, pcp_rank, req_indices, - positions, expected_result)) + expected_result = np.array(rank_expectations[current_rank], dtype=np.int32) + test_configs.append((dcp_rank, pcp_rank, req_indices, positions, expected_result)) - self._test_slot_mapping_for_ranks(dcp_world_size=4, - pcp_world_size=2, - cp_kv_cache_interleave_size=1, - test_configs=test_configs) + self._test_slot_mapping_for_ranks( + dcp_world_size=4, pcp_world_size=2, cp_kv_cache_interleave_size=1, test_configs=test_configs + ) def test_compute_slot_mapping_dcp4_pcp2_interleave128(self): """Test compute_slot_mapping with DCP=4, PCP=2, interleave_size=128 - + With interleave_size=128, tokens are distributed in chunks 
of 128 across ranks. Virtual block size = 16 * 4 * 2 = 128 - + Token distribution with interleave_size=128: - Positions 0-127 belong to rank 0 (first chunk of 128) - Positions 128-255 belong to rank 1 (second chunk of 128) - Positions 256-383 belong to rank 2 (third chunk of 128) - And so on... - + Using 130 positions ensures we test both rank 0 (positions 0-127) and rank 1 (positions 128-129). """ num_positions = 130 @@ -245,14 +241,13 @@ def test_compute_slot_mapping_dcp4_pcp2_interleave128(self): expected_result = [-1] * 130 test_configs.append( - (dcp_rank, pcp_rank, req_indices, positions, - np.array(expected_result, dtype=np.int32))) + (dcp_rank, pcp_rank, req_indices, positions, np.array(expected_result, dtype=np.int32)) + ) - self._test_slot_mapping_for_ranks(dcp_world_size=4, - pcp_world_size=2, - cp_kv_cache_interleave_size=128, - test_configs=test_configs) + self._test_slot_mapping_for_ranks( + dcp_world_size=4, pcp_world_size=2, cp_kv_cache_interleave_size=128, test_configs=test_configs + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 2acc473232da3a2cb9ed1cb903701f29b047a735 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Wed, 1 Apr 2026 02:36:21 +0000 Subject: [PATCH 16/31] fix ut test Signed-off-by: 01267596 --- tests/ut/worker/test_block_table.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/ut/worker/test_block_table.py b/tests/ut/worker/test_block_table.py index 9218dbddea0..af3d536cb61 100644 --- a/tests/ut/worker/test_block_table.py +++ b/tests/ut/worker/test_block_table.py @@ -101,9 +101,22 @@ def _test_slot_mapping_for_ranks(self, dcp_world_size, pcp_world_size, cp_kv_cac num_reqs = max(req_indices) + 1 if len(req_indices) > 0 else 1 self.setup_block_table_data(block_table, num_reqs=num_reqs) - block_table.compute_slot_mapping(req_indices, positions) + # Build query_start_loc [num_reqs + 1] from req_indices. + # query_start_loc holds the cumulative token count per request, + # e.g. req_indices=[0,0,1,1] -> query_start_loc=[0,2,4]. + num_tokens = len(positions) + counts = np.bincount(req_indices, minlength=num_reqs) + query_start_loc_np = np.concatenate([[0], np.cumsum(counts)]).astype(np.int32) + query_start_loc = torch.from_numpy(query_start_loc_np) + + # positions must be a torch int64 tensor to match the + # _compute_slot_mapping_kernel's positions_ptr type. 
+ positions_tensor = torch.from_numpy(positions.astype(np.int64)) + + block_table.compute_slot_mapping(num_reqs, query_start_loc, positions_tensor) + + actual_result = block_table.slot_mapping.np[:num_tokens] - actual_result = block_table.slot_mapping.np[: len(positions)] np.testing.assert_array_equal( actual_result, expected_result, From 946107dd011997f0ef12513fe05cc5fcc89e4b12 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Wed, 1 Apr 2026 03:24:46 +0000 Subject: [PATCH 17/31] fix ut test Signed-off-by: 01267596 --- tests/ut/worker/test_block_table.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ut/worker/test_block_table.py b/tests/ut/worker/test_block_table.py index af3d536cb61..a1849cfa633 100644 --- a/tests/ut/worker/test_block_table.py +++ b/tests/ut/worker/test_block_table.py @@ -18,6 +18,7 @@ import numpy as np import torch +import vllm.utils.cpu_triton_utils as cpu_tl from vllm.distributed.parallel_state import GroupCoordinator from tests.ut.base import TestBase @@ -112,7 +113,7 @@ def _test_slot_mapping_for_ranks(self, dcp_world_size, pcp_world_size, cp_kv_cac # positions must be a torch int64 tensor to match the # _compute_slot_mapping_kernel's positions_ptr type. positions_tensor = torch.from_numpy(positions.astype(np.int64)) - + block_table._compute_slot_mapping_kernel = cpu_tl.compute_slot_mapping_kernel block_table.compute_slot_mapping(num_reqs, query_start_loc, positions_tensor) actual_result = block_table.slot_mapping.np[:num_tokens] From 80a604b4ac1aa90415f9f785c6810a2b631a77f5 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Wed, 1 Apr 2026 06:06:00 +0000 Subject: [PATCH 18/31] fix ut test Signed-off-by: 01267596 --- tests/ut/worker/test_block_table.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/ut/worker/test_block_table.py b/tests/ut/worker/test_block_table.py index a1849cfa633..f031b3811b0 100644 --- a/tests/ut/worker/test_block_table.py +++ b/tests/ut/worker/test_block_table.py @@ -18,7 +18,8 @@ import numpy as np import torch -import vllm.utils.cpu_triton_utils as cpu_tl + +# import vllm.utils.cpu_triton_utils as cpu_tl from vllm.distributed.parallel_state import GroupCoordinator from tests.ut.base import TestBase @@ -113,7 +114,7 @@ def _test_slot_mapping_for_ranks(self, dcp_world_size, pcp_world_size, cp_kv_cac # positions must be a torch int64 tensor to match the # _compute_slot_mapping_kernel's positions_ptr type. 
positions_tensor = torch.from_numpy(positions.astype(np.int64)) - block_table._compute_slot_mapping_kernel = cpu_tl.compute_slot_mapping_kernel + # block_table._compute_slot_mapping_kernel = cpu_tl.compute_slot_mapping_kernel block_table.compute_slot_mapping(num_reqs, query_start_loc, positions_tensor) actual_result = block_table.slot_mapping.np[:num_tokens] From 5c88ee5222741b10ead1fb4738a36f075ffd1220 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Wed, 1 Apr 2026 06:48:46 +0000 Subject: [PATCH 19/31] fix Signed-off-by: 01267596 --- vllm_ascend/attention/attention_v1.py | 30 ++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 5a7b8d3a329..74293a5baea 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -688,7 +688,20 @@ def full_graph_pa( graph_params.handles[num_tokens].append(handle) return output - def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: AscendMetadata): + def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: AscendMetadata, kv_cache=None): + # PrefillNoCache doesn't need key_cache, but other modes do + # Only initialize/require cache for modes that actually use it + if attn_metadata.attn_state != AscendAttentionState.PrefillNoCache: + # Initialize cache from kv_cache if not already set (for DecodeOnly mode) + if self.key_cache is None and kv_cache is not None: + if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2: + self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] + elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2: + self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] + + if self.key_cache is None: + raise RuntimeError(f"key_cache is None in _get_fia_params for mode {attn_metadata.attn_state}. kv_cache={kv_cache}") + if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache: block_size = 128 block_table = None @@ -766,6 +779,7 @@ def forward_fused_infer_attention( value: torch.Tensor, attn_metadata: AscendMetadata, output: torch.Tensor, + kv_cache=None, ): # we inherit ForwardContext in model runner v2, when enable model # runner v2, there is not capturing attribute in forward_context, @@ -781,7 +795,7 @@ def forward_fused_infer_attention( and self.sinks is None ): return self._forward_fia_slidingwindow(query, attn_metadata, output) - key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata) + key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata, kv_cache) num_tokens = attn_metadata.actual_seq_lengths_q[-1] query = query[:num_tokens] if ( @@ -927,7 +941,7 @@ def forward_impl( ): output = self.forward_paged_attention(query, attn_metadata, output) else: - output = self.forward_fused_infer_attention(query, key, value, attn_metadata, output) + output = self.forward_fused_infer_attention(query, key, value, attn_metadata, output, kv_cache) return output @@ -963,6 +977,16 @@ def forward( num_tokens = query.shape[0] if attn_metadata is None: return output.fill_(0) + + # Initialize key_cache and value_cache from kv_cache if not already set. + # This is needed for DecodeOnly mode where key/value are None but we still + # need access to the cache for attention computation. 
+ if self.key_cache is None and kv_cache is not None: + if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2: + self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] + elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2: + self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] + output_padded = None if key is not None and value is not None: output_padded = output From c1e05dbf45f3edeca51bc89ba980d2dadbaa7fa1 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Wed, 1 Apr 2026 10:57:57 +0000 Subject: [PATCH 20/31] fix Signed-off-by: 01267596 --- vllm_ascend/attention/attention_v1.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 74293a5baea..f4dba63282a 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -696,11 +696,15 @@ def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: if self.key_cache is None and kv_cache is not None: if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2: self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] + elif isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 1: + self.key_cache, self.value_cache = kv_cache[0][0], kv_cache[0][1] elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2: self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] if self.key_cache is None: - raise RuntimeError(f"key_cache is None in _get_fia_params for mode {attn_metadata.attn_state}. kv_cache={kv_cache}") + raise RuntimeError( + f"key_cache is None in _get_fia_params for mode {attn_metadata.attn_state}. kv_cache={kv_cache}" + ) if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache: block_size = 128 @@ -795,7 +799,9 @@ def forward_fused_infer_attention( and self.sinks is None ): return self._forward_fia_slidingwindow(query, attn_metadata, output) - key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata, kv_cache) + key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params( + key, value, attn_metadata, kv_cache + ) num_tokens = attn_metadata.actual_seq_lengths_q[-1] query = query[:num_tokens] if ( @@ -982,9 +988,13 @@ def forward( # This is needed for DecodeOnly mode where key/value are None but we still # need access to the cache for attention computation. 
if self.key_cache is None and kv_cache is not None: - if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2: - self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] - elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2: + if ( + isinstance(kv_cache, torch.Tensor) + and kv_cache.dim() > 0 + and kv_cache.shape[0] == 2 + or isinstance(kv_cache, (list, tuple)) + and len(kv_cache) >= 2 + ): self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] output_padded = None From 687e8c11a44d7b05f53e8c2de977514bd90ed3c8 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Thu, 2 Apr 2026 01:14:04 +0000 Subject: [PATCH 21/31] fix kvcache Signed-off-by: 01267596 --- vllm_ascend/attention/attention_v1.py | 12 +++++++----- vllm_ascend/ops/mla.py | 7 +++++-- vllm_ascend/patch/worker/patch_qwen3_5.py | 7 +++++-- vllm_ascend/patch/worker/patch_qwen3_next.py | 7 +++++-- vllm_ascend/patch/worker/patch_qwen3_next_mtp.py | 3 +-- 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index f4dba63282a..d6ee960726c 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -694,11 +694,13 @@ def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: if attn_metadata.attn_state != AscendAttentionState.PrefillNoCache: # Initialize cache from kv_cache if not already set (for DecodeOnly mode) if self.key_cache is None and kv_cache is not None: - if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2: - self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] - elif isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 1: - self.key_cache, self.value_cache = kv_cache[0][0], kv_cache[0][1] - elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2: + if ( + isinstance(kv_cache, torch.Tensor) + and kv_cache.dim() > 0 + and kv_cache.shape[0] == 2 + or isinstance(kv_cache, (list, tuple)) + and len(kv_cache) >= 2 + ): self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] if self.key_cache is None: diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py index 17a5858aa68..c308c56daec 100644 --- a/vllm_ascend/ops/mla.py +++ b/vllm_ascend/ops/mla.py @@ -33,7 +33,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_forward_context import _EXTRA_CTX -from vllm_ascend.utils import is_vl_model, parse_layer_idx +from vllm_ascend.utils import is_vl_model, parse_layer_idx, vllm_version_is class IndexerWrapper(nn.Module): @@ -183,7 +183,10 @@ def mla_forward( attn_metadata = forward_context.attn_metadata[self.mla_attn.layer_name] else: attn_metadata = forward_context.attn_metadata - kv_cache = self.mla_attn.kv_cache[0] + if vllm_version_is("0.18.0"): + kv_cache = self.mla_attn.kv_cache[forward_context.virtual_engine] + else: + kv_cache = self.mla_attn.kv_cache self.mla_attn.impl.forward( self.mla_attn.layer_name, hidden_states, kv_cache, attn_metadata, need_gather_q_kv, output ) diff --git a/vllm_ascend/patch/worker/patch_qwen3_5.py b/vllm_ascend/patch/worker/patch_qwen3_5.py index 66f0cde6c86..133bd30eff6 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_5.py +++ b/vllm_ascend/patch/worker/patch_qwen3_5.py @@ -33,7 +33,7 @@ from vllm_ascend.attention.utils import maybe_save_kv_layer_to_connector from vllm_ascend.ops.triton.fla.sigmoid_gating import fused_sigmoid_gating_delta_rule_update from 
vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch -from vllm_ascend.utils import enable_sp +from vllm_ascend.utils import enable_sp, vllm_version_is def to_int64_tuple(t): @@ -135,7 +135,10 @@ def _forward_core( non_spec_token_indx = attn_metadata.non_spec_token_indx spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501 non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 - self_kv_cache = self.kv_cache[0] + if vllm_version_is("0.18.0"): + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + else: + self_kv_cache = self.kv_cache conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] num_actual_tokens = attn_metadata.num_actual_tokens diff --git a/vllm_ascend/patch/worker/patch_qwen3_next.py b/vllm_ascend/patch/worker/patch_qwen3_next.py index 77aa9d62fa9..642a90f681e 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_next.py +++ b/vllm_ascend/patch/worker/patch_qwen3_next.py @@ -33,7 +33,7 @@ from vllm_ascend.ops.triton.fla.fused_qkvzba_split_reshape import fused_qkvzba_split_reshape_cat from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch from vllm_ascend.patch.worker.patch_qwen3_5 import to_int64_tuple -from vllm_ascend.utils import enable_sp +from vllm_ascend.utils import enable_sp, vllm_version_is class AscendQwen3Next_GatedDeltaNet(Qwen3NextGatedDeltaNet): @@ -125,7 +125,10 @@ def _forward_core( non_spec_token_indx = attn_metadata.non_spec_token_indx spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501 non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 - self_kv_cache = self.kv_cache[0] + if vllm_version_is("0.18.0"): + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + else: + self_kv_cache = self.kv_cache conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] num_actual_tokens = attn_metadata.num_actual_tokens diff --git a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py index 1bd00e0c058..21cb03951e7 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py +++ b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py @@ -44,8 +44,7 @@ def bind_kv_cache( # Bind kv_caches to forward context for layer_name, kv_cache in kv_caches.items(): - # NOTE: Use list because of v0 PP virtual engine. - forward_context[layer_name].kv_cache = [kv_cache] + forward_context[layer_name].kv_cache = kv_cache utils.bind_kv_cache = bind_kv_cache From a9bac6f5ba235d79a189c1844f0f767aee6f9dac Mon Sep 17 00:00:00 2001 From: 01267596 Date: Thu, 2 Apr 2026 01:43:36 +0000 Subject: [PATCH 22/31] fix ci Signed-off-by: 01267596 --- .github/workflows/_e2e_test.yaml | 2 +- .github/workflows/dockerfiles/Dockerfile.lint | 2 +- .github/workflows/pr_test_full.yaml | 4 ++-- .github/workflows/pr_test_light.yaml | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 3c125ad04a2..12662354fa9 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -19,7 +19,7 @@ on: continue_on_error: required: false type: boolean - default: false + default: true # The following inputs are used by comment-triggered E2E tests (/e2e ). # They carry space-separated pytest paths, categorized by runner type. # Leave empty (default) when running label-triggered full/light suites. 
diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index bd0c91fb2c7..bb27cf537b7 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029 +ARG VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 3e6fbf21ae4..d12b4941f94 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -80,7 +80,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029] + vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml @@ -102,7 +102,7 @@ jobs: strategy: fail-fast: false matrix: - vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029] + vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0] needs: [parse-trigger] if: ${{ needs.parse-trigger.outputs.allowed == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 105e29a4996..6368a0e44f9 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -90,7 +90,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029] + vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -102,7 +102,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029] + vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. 
From e931e98d54ae52536568b0ba04d906e6762f2dad Mon Sep 17 00:00:00 2001 From: 01267596 Date: Thu, 2 Apr 2026 06:35:47 +0000 Subject: [PATCH 23/31] fix Signed-off-by: 01267596 --- vllm_ascend/worker/model_runner_v1.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 773f4d68055..f2bbc20ddea 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -335,7 +335,8 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): ) # TODO(zhenwenqi) after https://github.com/vllm-project/vllm/pull/28988 is merged, we can delete this self.input_ids = self._make_buffer(max_buffer_num_tokens, dtype=torch.int32) - self.positions = self._make_buffer(max_buffer_num_tokens, dtype=torch.int64) + self.positions = torch.zeros( + self.max_num_tokens, dtype=torch.int64, device=self.device) self._set_up_drafter() @@ -675,7 +676,10 @@ def _prepare_inputs( total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs]) req_indices = np.repeat(self.arange_np[:num_reqs], num_scheduled_tokens) cu_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens) - positions_np = self.positions.np[:total_num_scheduled_tokens] + positions_np = ( + self.input_batch.num_computed_tokens_cpu[req_indices] + + self.query_pos.np[: cu_num_tokens[-1]] + ) np.add( self.input_batch.num_computed_tokens_cpu[req_indices], position_pcp[:total_num_scheduled_tokens], From 4363d137d99b8a55cdb50601a795559ac7990126 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Thu, 2 Apr 2026 09:00:13 +0000 Subject: [PATCH 24/31] fix Signed-off-by: 01267596 --- vllm_ascend/spec_decode/eagle_proposer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index c1ba5ec6bd7..9042f826b0a 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -374,7 +374,7 @@ def dummy_run( query_start_loc=self.query_start_loc.gpu[: num_reqs + 1], query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs + 1], seq_lens_cpu=self.runner.optimistic_seq_lens_cpu, - seq_lens=self.runner.seq_lens.gpu[:num_reqs], + seq_lens=self.runner.seq_lens[:num_reqs], num_reqs=num_reqs, num_actual_tokens=num_tokens, num_input_tokens=num_tokens, From 5609a66abe7edabfe74ce753aa1f390dd4739e1d Mon Sep 17 00:00:00 2001 From: 01267596 Date: Thu, 2 Apr 2026 09:02:30 +0000 Subject: [PATCH 25/31] fix Signed-off-by: 01267596 --- vllm_ascend/spec_decode/eagle_proposer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 9042f826b0a..7f3a90b7e2c 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -558,7 +558,7 @@ def _propose( common_attn_metadata.block_table_tensor = self._adjust_tensor( common_attn_metadata.block_table_tensor, num_reqs_padded ) - common_attn_metadata.seq_lens = self._adjust_tensor(self.runner.seq_lens.gpu, num_reqs_padded) + common_attn_metadata.seq_lens = self._adjust_tensor(self.runner.seq_lens, num_reqs_padded) common_attn_metadata.seq_lens_cpu = self._adjust_tensor( self.runner.optimistic_seq_lens_cpu, num_reqs_padded ) From 7f441b0d17b8c9d9f5fb0bfb298312cad4d11ed9 Mon Sep 17 00:00:00 2001 From: HF-001 <1670186653@qq.com> Date: Thu, 2 Apr 2026 22:17:44 +0800 Subject: [PATCH 26/31] fix Signed-off-by: HF-001 <1670186653@qq.com> 
--- vllm_ascend/attention/mla_v1.py | 6 +++++- vllm_ascend/spec_decode/eagle_proposer.py | 8 +++++--- vllm_ascend/worker/model_runner_v1.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 3b1c575b871..12a773dfdac 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -435,7 +435,11 @@ def build( query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] self.query_lens = query_seq_lens_cpu[:num_reqs] - self.seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs] + self.seq_lens = None + if common_attn_metadata.seq_lens_cpu is not None: + self.seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs] + else: + self.seq_lens = common_attn_metadata.seq_lens[:num_reqs] self.graph_pad_size = common_attn_metadata.graph_pad_size block_table_size = self.get_block_table_size(common_attn_metadata, BUILD_METADATA_STEP_PREFILL) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 7f3a90b7e2c..ad809bd1d41 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -384,7 +384,7 @@ def dummy_run( block_table_tensor=self.runner.input_batch.block_table[0].get_device_tensor()[:num_reqs], # This is used to hold a position. slot_mapping=self.runner.input_batch.block_table[0].slot_mapping.gpu, - positions=self.runner.positions.gpu, + positions=self.runner.positions, attn_state=self.runner.attn_state, decode_token_per_req=self.runner.decode_token_per_req, max_seq_len=0, @@ -1192,8 +1192,10 @@ def attn_update_stack_num_spec_norm( # in the merged graph, it does not affect position 1 # FIXME(lilinsiman) common_attn_metadata.seq_lens = common_attn_metadata.seq_lens.clone() - common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu.clone() - common_attn_metadata.num_computed_tokens_cpu = common_attn_metadata.num_computed_tokens_cpu.clone() + if common_attn_metadata.seq_lens_cpu is not None: + common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu.clone() + if common_attn_metadata.num_computed_tokens_cpu is not None: + common_attn_metadata.num_computed_tokens_cpu = common_attn_metadata.num_computed_tokens_cpu.clone() common_attn_metadata.positions = common_attn_metadata.positions.clone() # NOTE(woosuk): We should handle the case where the draft model diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index f2bbc20ddea..536b692c1a7 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -675,7 +675,7 @@ def _prepare_inputs( # Re-update after PCP split sequences. 
total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs]) req_indices = np.repeat(self.arange_np[:num_reqs], num_scheduled_tokens) - cu_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens) + cu_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens, self.query_pos.np) positions_np = ( self.input_batch.num_computed_tokens_cpu[req_indices] + self.query_pos.np[: cu_num_tokens[-1]] From bcbee077106f30b0aece3231e4c7dad26e62b757 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Fri, 3 Apr 2026 03:02:36 +0000 Subject: [PATCH 27/31] fix Signed-off-by: 01267596 --- vllm_ascend/attention/attention_v1.py | 2 +- vllm_ascend/attention/mla_v1.py | 2 +- vllm_ascend/attention/sfa_v1.py | 7 ++++++- vllm_ascend/spec_decode/eagle_proposer.py | 7 ++++--- vllm_ascend/worker/model_runner_v1.py | 2 +- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index d6ee960726c..f4afbca2913 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -278,7 +278,7 @@ def build( ) block_table = common_attn_metadata.block_table_tensor - seq_lens = common_attn_metadata.seq_lens[:num_reqs] + seq_lens = common_attn_metadata.seq_lens[:num_reqs].to("cpu") slot_mapping = common_attn_metadata.slot_mapping[:num_actual_tokens] # this slot_mapping override doesn't work since vllm will override it again. We should fix it vllm. diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 12a773dfdac..601160344a2 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -439,7 +439,7 @@ def build( if common_attn_metadata.seq_lens_cpu is not None: self.seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs] else: - self.seq_lens = common_attn_metadata.seq_lens[:num_reqs] + self.seq_lens = common_attn_metadata.seq_lens[:num_reqs].to("cpu") self.graph_pad_size = common_attn_metadata.graph_pad_size block_table_size = self.get_block_table_size(common_attn_metadata, BUILD_METADATA_STEP_PREFILL) diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py index 5ed5cac8782..8a0e3b90b5e 100644 --- a/vllm_ascend/attention/sfa_v1.py +++ b/vllm_ascend/attention/sfa_v1.py @@ -240,7 +240,12 @@ def build( cum_query_lens = common_attn_metadata.query_start_loc[1 : num_reqs + 1] seq_lens = common_attn_metadata.seq_lens[:num_reqs] - seq_lens_cpu = common_attn_metadata.seq_lens_cpu[:num_reqs] + + seq_lens_cpu = None + if common_attn_metadata.seq_lens_cpu is not None: + seq_lens_cpu = common_attn_metadata.seq_lens_cpu[:num_reqs] + else: + seq_lens_cpu = common_attn_metadata.seq_lens[:num_reqs].to("cpu") cos, sin = get_cos_and_sin_mla(input_positions, True) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index ad809bd1d41..ece9bdf1966 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -562,9 +562,10 @@ def _propose( common_attn_metadata.seq_lens_cpu = self._adjust_tensor( self.runner.optimistic_seq_lens_cpu, num_reqs_padded ) - common_attn_metadata.num_computed_tokens_cpu = self._adjust_tensor( - common_attn_metadata.num_computed_tokens_cpu, num_reqs_padded - ) + if common_attn_metadata.num_computed_tokens_cpu is not None: + common_attn_metadata.num_computed_tokens_cpu = self._adjust_tensor( + common_attn_metadata.num_computed_tokens_cpu, num_reqs_padded + ) else: num_reqs_padded = common_attn_metadata.num_reqs diff --git 
a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index dbf62b12767..85f8e59ab85 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -676,7 +676,7 @@ def _prepare_inputs( # Re-update after PCP split sequences. total_num_scheduled_tokens = sum(num_scheduled_tokens[:num_reqs]) req_indices = np.repeat(self.arange_np[:num_reqs], num_scheduled_tokens) - cu_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens, self.query_pos.np) + cu_num_tokens = self._get_cumsum_and_arange(num_scheduled_tokens, self.query_pos.np) positions_np = ( self.input_batch.num_computed_tokens_cpu[req_indices] + self.query_pos.np[: cu_num_tokens[-1]] From c83ea5512230f8edc40eb70893acf91c63715486 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Fri, 3 Apr 2026 06:44:45 +0000 Subject: [PATCH 28/31] fix Signed-off-by: 01267596 --- vllm_ascend/spec_decode/eagle_proposer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index ece9bdf1966..caeaccd989b 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -1164,9 +1164,10 @@ def attn_update_stack_num_spec_norm( common_attn_metadata.seq_lens_cpu = self._adjust_tensor( common_attn_metadata.seq_lens_cpu, input_batch_size ) - common_attn_metadata.num_computed_tokens_cpu = self._adjust_tensor( - common_attn_metadata.num_computed_tokens_cpu, input_batch_size - ) + if common_attn_metadata.num_computed_tokens_cpu is not None: + common_attn_metadata.num_computed_tokens_cpu = self._adjust_tensor( + common_attn_metadata.num_computed_tokens_cpu, input_batch_size + ) common_attn_metadata.query_start_loc = self.arange[: input_batch_size + 1] common_attn_metadata.query_start_loc_cpu = torch.from_numpy( self.token_arange_np[: input_batch_size + 1] From c0239cc33ab3263862778b1fbe398c9ff53b7521 Mon Sep 17 00:00:00 2001 From: 01267596 Date: Fri, 3 Apr 2026 08:38:08 +0000 Subject: [PATCH 29/31] fix Signed-off-by: 01267596 --- vllm_ascend/worker/model_runner_v1.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 85f8e59ab85..f7baf4d4d26 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -877,10 +877,23 @@ def _prepare_inputs( self.num_scheduled_tokens.np[:num_reqs] = num_scheduled_tokens self.num_scheduled_tokens.copy_to_gpu(num_reqs) num_scheduled_tokens_gpu = self.num_scheduled_tokens.gpu[:num_reqs] - self.positions[:total_num_scheduled_tokens] = ( - self.num_computed_tokens[req_indices_gpu].to(torch.int64) - + self.query_pos.gpu[:total_num_scheduled_tokens] - ) + # fix prefix cache ci test + if self.pcp_size > 1: + # When PCP (Prefill Context Parallel) is enabled, positions use + # special PCP offsets (position_pcp) that are only computed on CPU. + # Copy the correctly-computed CPU positions to GPU instead of + # recomputing on GPU (which would miss the PCP offsets). 
+ self.positions[:total_num_scheduled_tokens].copy_( + torch.from_numpy( + positions_np[:total_num_scheduled_tokens] + ).to(self.device), + non_blocking=True, + ) + else: + self.positions[:total_num_scheduled_tokens] = ( + self.num_computed_tokens[req_indices_gpu].to(torch.int64) + + self.query_pos.gpu[:total_num_scheduled_tokens] + ) self.seq_lens[:num_reqs] = ( self.num_computed_tokens[:num_reqs] + num_scheduled_tokens_gpu ) From 39f24983dade9462198e0100c4029dbc88901e2f Mon Sep 17 00:00:00 2001 From: HF-001 <1670186653@qq.com> Date: Sat, 4 Apr 2026 08:48:51 +0800 Subject: [PATCH 30/31] fix Signed-off-by: HF-001 <1670186653@qq.com> --- vllm_ascend/ops/mla.py | 2 +- vllm_ascend/patch/worker/patch_qwen3_5.py | 2 +- vllm_ascend/patch/worker/patch_qwen3_next.py | 2 +- vllm_ascend/sample/sampler.py | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py index 5d247df425b..689ed0cd672 100644 --- a/vllm_ascend/ops/mla.py +++ b/vllm_ascend/ops/mla.py @@ -33,7 +33,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_forward_context import _EXTRA_CTX -from vllm_ascend.utils import is_vl_model, parse_layer_idx, vllm_version_is +from vllm_ascend.utils import is_vl_model, parse_layer_idx class IndexerWrapper(nn.Module): diff --git a/vllm_ascend/patch/worker/patch_qwen3_5.py b/vllm_ascend/patch/worker/patch_qwen3_5.py index bd4ab2fa8e3..3cf5ff22bb7 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_5.py +++ b/vllm_ascend/patch/worker/patch_qwen3_5.py @@ -34,7 +34,7 @@ from vllm_ascend.ops.triton.fla.sigmoid_gating import fused_sigmoid_gating_delta_rule_update from vllm_ascend.ops.triton.fla.utils import clear_ssm_states from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch -from vllm_ascend.utils import enable_sp, vllm_version_is +from vllm_ascend.utils import enable_sp def to_int64_tuple(t): diff --git a/vllm_ascend/patch/worker/patch_qwen3_next.py b/vllm_ascend/patch/worker/patch_qwen3_next.py index 71fe9b6c2b3..cb2c216c729 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_next.py +++ b/vllm_ascend/patch/worker/patch_qwen3_next.py @@ -34,7 +34,7 @@ from vllm_ascend.ops.triton.fla.utils import clear_ssm_states from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch from vllm_ascend.patch.worker.patch_qwen3_5 import to_int64_tuple -from vllm_ascend.utils import enable_sp, vllm_version_is +from vllm_ascend.utils import enable_sp class AscendQwen3Next_GatedDeltaNet(Qwen3NextGatedDeltaNet): diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py index eb8cbd4ee7d..aa7a844d9b1 100644 --- a/vllm_ascend/sample/sampler.py +++ b/vllm_ascend/sample/sampler.py @@ -6,7 +6,6 @@ from vllm.v1.sample.sampler import Sampler from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.batch_invariant import vllm_is_batch_invariant from vllm_ascend.sample.penalties import apply_all_penalties from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type, global_stream, npu_stream_switch From e27c9001f2b2f48a5f95589d8da117f93c442493 Mon Sep 17 00:00:00 2001 From: HF-001 <1670186653@qq.com> Date: Sun, 5 Apr 2026 21:43:14 +0800 Subject: [PATCH 31/31] fix Signed-off-by: HF-001 <1670186653@qq.com> --- vllm_ascend/attention/attention_v1.py | 5 ++++- vllm_ascend/worker/model_runner_v1.py | 10 +++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 
f4afbca2913..756827776b3 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -278,7 +278,10 @@ def build( ) block_table = common_attn_metadata.block_table_tensor - seq_lens = common_attn_metadata.seq_lens[:num_reqs].to("cpu") + if common_attn_metadata.seq_lens is not None: + seq_lens = common_attn_metadata.seq_lens[:num_reqs].to("cpu") + else: + seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs] slot_mapping = common_attn_metadata.slot_mapping[:num_actual_tokens] # this slot_mapping override doesn't work since vllm will override it again. We should fix it vllm. diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index ba165b216bd..54dfd9a1354 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -337,7 +337,15 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): # TODO(zhenwenqi) after https://github.com/vllm-project/vllm/pull/28988 is merged, we can delete this self.input_ids = self._make_buffer(max_buffer_num_tokens, dtype=torch.int32) self.positions = torch.zeros( - self.max_num_tokens, dtype=torch.int64, device=self.device) + max_buffer_num_tokens, dtype=torch.int64, device=self.device) + + # Create a CPU numpy buffer for positions computation when + # self.positions is a plain tensor (non-CpuGpuBuffer case). + self._positions_cpu_buf = torch.zeros( + max_buffer_num_tokens, dtype=torch.int64, + pin_memory=self.pin_memory, + ) + self._positions_np_buf = self._positions_cpu_buf.numpy() self._set_up_drafter()
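
The final hunk's comment describes a common staging pattern: keep a host-side positions buffer with a zero-copy numpy view, fill it on the CPU, and push only the filled prefix to the device in one non-blocking copy. A minimal sketch of that pattern follows; the names and sizes (max_buffer_num_tokens, num_computed, offsets, req_indices) are illustrative stand-ins rather than the runner's real attributes, and pinning is left off so the snippet runs without a device context.

import numpy as np
import torch

# Hypothetical size; the runner derives this from the scheduler config.
max_buffer_num_tokens = 8192
device = torch.device("cpu")  # stands in for the NPU device in this sketch

# Device buffer the model actually reads positions from.
positions = torch.zeros(max_buffer_num_tokens, dtype=torch.int64, device=device)

# Host staging buffer with a zero-copy numpy view for cheap CPU-side fills.
# The patch pins this buffer (pin_memory=self.pin_memory); pinning is omitted
# here so the sketch does not require a device context.
positions_cpu = torch.zeros(max_buffer_num_tokens, dtype=torch.int64)
positions_np = positions_cpu.numpy()

# Fill only the scheduled prefix on the CPU, e.g. computed tokens + per-token offsets.
num_computed = np.array([5, 9], dtype=np.int64)       # per request
offsets = np.array([0, 1, 0, 1, 2], dtype=np.int64)   # per token within its request
req_indices = np.array([0, 0, 1, 1, 1])
num_tokens = len(offsets)
positions_np[:num_tokens] = num_computed[req_indices] + offsets

# Stage a single async host-to-device copy of just the filled prefix.
positions[:num_tokens].copy_(positions_cpu[:num_tokens], non_blocking=True)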