2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -65,7 +65,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| Date | Event |
|------------|-------------------------------------------|
-| 2026.02.26 | Release candidates, v0.15.0rc1 |
+| 2026.02.27 | Release candidates, v0.15.0rc1 |
| 2026.02.06 | v0.13.0 Final release, v0.13.0 |
| 2026.01.26 | Release candidates, v0.14.0rc1 |
| 2026.01.24 | Release candidates, v0.13.0rc2 |
2 changes: 1 addition & 1 deletion docs/source/user_guide/release_notes.md
@@ -1,6 +1,6 @@
# Release Notes

-## v0.15.0rc1 - 2026.02.26
+## v0.15.0rc1 - 2026.02.27

This is the first release candidate of v0.15.0 for vLLM Ascend. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/latest) to get started.

2 changes: 1 addition & 1 deletion tests/ut/attention/test_sfa_v1.py
@@ -14,7 +14,7 @@
from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl,
AscendSFAMetadata,
AscendSFAMetadataBuilder)
-from vllm_ascend.utils import enable_dsa_cp, vllm_version_is
+from vllm_ascend.utils import enable_dsa_cp


class TestAscendSFABackend(TestBase):
2 changes: 1 addition & 1 deletion tests/ut/quantization/test_modelslim_config.py
@@ -13,7 +13,7 @@
MODELSLIM_CONFIG_FILENAME,
AscendModelSlimConfig,
)
-from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is
+from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD

from vllm.model_executor.layers.attention import Attention

5 changes: 2 additions & 3 deletions vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -153,11 +153,10 @@ def __init__(self, *args, **kwargs):
self.quant_type = self.get_quant_type()

_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)

if not vllm_version_is("0.15.0"):
if not vllm_version_is("0.16.0"):
self.runner = self._init_runner()

if not vllm_version_is("0.15.0"):
if not vllm_version_is("0.16.0"):

def _init_runner(self):
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
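For context, a minimal sketch of the `vllm_version_is` gating pattern this hunk retargets from 0.15.0 to 0.16.0. The helper below is a simplified stand-in (the real `vllm_ascend.utils.vllm_version_is` may treat dev and post releases differently), and `FusedMoE310` is a hypothetical stripped-down class, not the module's actual code:

```python
# Sketch of version-gated definitions, assuming a simplified version check.
from importlib.metadata import version


def vllm_version_is(target: str) -> bool:
    # Simplified stand-in for vllm_ascend.utils.vllm_version_is:
    # true when the installed vLLM release matches the target prefix.
    return version("vllm").startswith(target)


class FusedMoE310:  # hypothetical, for illustration only
    def __init__(self) -> None:
        if not vllm_version_is("0.16.0"):
            # Newer vLLM: build the MoE runner eagerly at construction time.
            self.runner = self._init_runner()

    # The method itself only exists on newer vLLM; the class-body check
    # runs once, when the class is defined.
    if not vllm_version_is("0.16.0"):

        def _init_runner(self):
            return object()  # placeholder for AscendMoERunner
```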
10 changes: 3 additions & 7 deletions vllm_ascend/compilation/passes/sequence_parallelism.py
@@ -1,18 +1,14 @@
import torch
import torch._inductor.pattern_matcher as pm
from torch._inductor.pattern_matcher import PatternMatcherPass

-from vllm_ascend.utils import is_moe_model, vllm_version_is
-
-if vllm_version_is("0.15.0"):
-    from vllm.compilation.vllm_inductor_pass import VllmInductorPass  # type: ignore
-else:
-    from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
+from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
from vllm.config import VllmConfig
from vllm.config.utils import Range
from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group, tensor_model_parallel_all_reduce
from vllm.logger import logger

+from vllm_ascend.utils import is_moe_model
+
SP_THRESHOLD = 1000


24 changes: 7 additions & 17 deletions vllm_ascend/ops/mm_encoder_attention.py
@@ -21,8 +21,6 @@
import torch_npu
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention # type: ignore

-from vllm_ascend.utils import vllm_version_is
-
MIN_PAD_SIZE: int = 64 # min_size to pad weight
MAX_PAD_SIZE: int = 128 # max_size to pad weight

@@ -64,9 +62,7 @@ def __init__(
prefix=prefix,
)

if not vllm_version_is("0.15.0"):
self.layer_index = int("".join(filter(str.isdigit, prefix)))

self.layer_index = int("".join(filter(str.isdigit, prefix)))
self.enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
self.scale_value = self.head_size**-0.5

@@ -106,19 +102,13 @@ def forward_oot(
kv_len = key.size(1)
is_reshaped = query.dim() == 4

if vllm_version_is("0.15.0"):
# Directly use seq_lens cpu cache to avoid d2h copy.
global seq_lens_cpu_cache
if self.layer_index == 0:
if cu_seqlens is None:
cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu")
seq_lens_cpu = torch.diff(cu_seqlens).to("cpu")
else:
global seq_lens_cpu_cache
if self.layer_index == 0:
if cu_seqlens is None:
cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu")
# Update seq_lens cpu cache.
seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu")
# Directly use seq_lens cpu cache to avoid d2h copy.
seq_lens_cpu = seq_lens_cpu_cache
# Update seq_lens cpu cache.
seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu")

# q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
q, k, v = self._reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)
@@ -138,7 +128,7 @@
query=q,
key=k,
value=v,
-            seq_len=seq_lens_cpu,
+            seq_len=seq_lens_cpu_cache,
scale_value=self.scale_value,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
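This hunk drops the 0.15.0 path and keeps only the cached one. As a rough sketch of the idea (the standalone helper and its name are illustrative, not the module's API): layer 0 computes the per-sequence lengths once on the CPU via `torch.diff` over `cu_seqlens`, and every later encoder layer reuses the cached tensor instead of paying a device-to-host copy per layer:

```python
import torch
from typing import Optional

seq_lens_cpu_cache: Optional[torch.Tensor] = None


def seq_lens_for_layer(layer_index: int, cu_seqlens: Optional[torch.Tensor],
                       bsz: int, q_len: int) -> torch.Tensor:
    """Illustrative helper: cache seq_lens on CPU at layer 0, reuse after."""
    global seq_lens_cpu_cache
    if layer_index == 0:
        if cu_seqlens is None:
            # Uniform lengths: boundaries [0, q_len, 2*q_len, ..., bsz*q_len].
            cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len,
                                      dtype=torch.int32, device="cpu")
        # Update the cache once per forward pass, avoiding a d2h copy per layer.
        seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu")
    assert seq_lens_cpu_cache is not None
    return seq_lens_cpu_cache


print(seq_lens_for_layer(0, None, bsz=2, q_len=4))  # tensor([4, 4], dtype=torch.int32)
print(seq_lens_for_layer(5, None, bsz=2, q_len=4))  # cache hit, no recompute
```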
6 changes: 1 addition & 5 deletions vllm_ascend/sample/rejection_sampler.py
@@ -19,7 +19,6 @@
sample_recovered_tokens_kernel,
)
from vllm_ascend.sample.sampler import apply_top_k_top_p
-from vllm_ascend.utils import vllm_version_is


def apply_sampling_constraints(
@@ -167,10 +166,7 @@ def rejection_sample(
return output_token_ids

# Compute probability distribution from target logits.
if vllm_version_is("0.15.0"):
target_probs = target_logits
else:
target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
assert target_probs.is_contiguous()

# Generate uniform probabilities for rejection sampling.
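After this change, `rejection_sample` always receives raw target logits and normalizes them itself. A minimal sketch of the acceptance step that consumes `target_probs` (the surrounding names `draft_probs` and `draft_token_ids` are assumptions for illustration, not the module's exact signature):

```python
import torch


def accept_draft_tokens(target_logits: torch.Tensor,    # [n, vocab]
                        draft_probs: torch.Tensor,      # [n, vocab]
                        draft_token_ids: torch.Tensor,  # [n]
                        ) -> torch.Tensor:
    # As in the diff: turn raw target logits into a probability distribution.
    target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
    # p: target probability of each drafted token; q: draft probability.
    p = target_probs.gather(1, draft_token_ids.unsqueeze(1)).squeeze(1)
    q = draft_probs.gather(1, draft_token_ids.unsqueeze(1)).squeeze(1)
    # Standard speculative-decoding rule: accept with probability min(1, p/q).
    u = torch.rand_like(p)
    return u <= (p / q).clamp(max=1.0)
```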