diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index c04969d4942..94b3059b0d3 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -65,7 +65,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | Date | Event | |------------|-------------------------------------------| -| 2026.02.26 | Release candidates, v0.15.0rc1 | +| 2026.02.27 | Release candidates, v0.15.0rc1 | | 2026.02.06 | v0.13.0 Final release, v0.13.0 | | 2026.01.26 | Release candidates, v0.14.0rc1 | | 2026.01.24 | Release candidates, v0.13.0rc2 | diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md index 582aae6e65f..fdec32b2118 100644 --- a/docs/source/user_guide/release_notes.md +++ b/docs/source/user_guide/release_notes.md @@ -1,6 +1,6 @@ # Release Notes -## v0.15.0rc1 - 2026.02.26 +## v0.15.0rc1 - 2026.02.27 This is the first release candidate of v0.15.0 for vLLM Ascend. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/latest) to get started. diff --git a/tests/ut/attention/test_sfa_v1.py b/tests/ut/attention/test_sfa_v1.py index 8d3cde39ddb..a90f73252af 100644 --- a/tests/ut/attention/test_sfa_v1.py +++ b/tests/ut/attention/test_sfa_v1.py @@ -14,7 +14,7 @@ from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl, AscendSFAMetadata, AscendSFAMetadataBuilder) -from vllm_ascend.utils import enable_dsa_cp, vllm_version_is +from vllm_ascend.utils import enable_dsa_cp class TestAscendSFABackend(TestBase): diff --git a/tests/ut/quantization/test_modelslim_config.py b/tests/ut/quantization/test_modelslim_config.py index 73497568360..556c8a4acd3 100644 --- a/tests/ut/quantization/test_modelslim_config.py +++ b/tests/ut/quantization/test_modelslim_config.py @@ -13,7 +13,7 @@ MODELSLIM_CONFIG_FILENAME, AscendModelSlimConfig, ) -from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is +from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD from vllm.model_executor.layers.attention import Attention diff --git a/vllm_ascend/_310p/fused_moe/fused_moe.py b/vllm_ascend/_310p/fused_moe/fused_moe.py index 3dfa072e108..4a411dc4cae 100644 --- a/vllm_ascend/_310p/fused_moe/fused_moe.py +++ b/vllm_ascend/_310p/fused_moe/fused_moe.py @@ -153,11 +153,10 @@ def __init__(self, *args, **kwargs): self.quant_type = self.get_quant_type() _MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config) - - if not vllm_version_is("0.15.0"): + if not vllm_version_is("0.16.0"): self.runner = self._init_runner() - if not vllm_version_is("0.15.0"): + if not vllm_version_is("0.16.0"): def _init_runner(self): from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner diff --git a/vllm_ascend/compilation/passes/sequence_parallelism.py b/vllm_ascend/compilation/passes/sequence_parallelism.py index d60d2f0e931..629edc9397a 100644 --- a/vllm_ascend/compilation/passes/sequence_parallelism.py +++ b/vllm_ascend/compilation/passes/sequence_parallelism.py @@ -1,18 +1,14 @@ import torch import torch._inductor.pattern_matcher as pm from torch._inductor.pattern_matcher import PatternMatcherPass - -from vllm_ascend.utils import is_moe_model, vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore -else: - from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass +from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass from vllm.config import VllmConfig from vllm.config.utils import Range from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group, tensor_model_parallel_all_reduce from vllm.logger import logger +from vllm_ascend.utils import is_moe_model + SP_THRESHOLD = 1000 diff --git a/vllm_ascend/ops/mm_encoder_attention.py b/vllm_ascend/ops/mm_encoder_attention.py index 733cd8888a3..889b88c42da 100644 --- a/vllm_ascend/ops/mm_encoder_attention.py +++ b/vllm_ascend/ops/mm_encoder_attention.py @@ -21,8 +21,6 @@ import torch_npu from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention # type: ignore -from vllm_ascend.utils import vllm_version_is - MIN_PAD_SIZE: int = 64 # min_size to pad weight MAX_PAD_SIZE: int = 128 # max_size to pad weight @@ -64,9 +62,7 @@ def __init__( prefix=prefix, ) - if not vllm_version_is("0.15.0"): - self.layer_index = int("".join(filter(str.isdigit, prefix))) - + self.layer_index = int("".join(filter(str.isdigit, prefix))) self.enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE self.scale_value = self.head_size**-0.5 @@ -106,19 +102,13 @@ def forward_oot( kv_len = key.size(1) is_reshaped = query.dim() == 4 - if vllm_version_is("0.15.0"): + # Directly use seq_lens cpu cache to avoid d2h copy. + global seq_lens_cpu_cache + if self.layer_index == 0: if cu_seqlens is None: cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu") - seq_lens_cpu = torch.diff(cu_seqlens).to("cpu") - else: - global seq_lens_cpu_cache - if self.layer_index == 0: - if cu_seqlens is None: - cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu") - # Update seq_lens cpu cache. - seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu") - # Directly use seq_lens cpu cache to avoid d2h copy. - seq_lens_cpu = seq_lens_cpu_cache + # Update seq_lens cpu cache. + seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu") # q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim] q, k, v = self._reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len) @@ -138,7 +128,7 @@ def forward_oot( query=q, key=k, value=v, - seq_len=seq_lens_cpu, + seq_len=seq_lens_cpu_cache, scale_value=self.scale_value, num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py index 04552b97922..695ae649dab 100644 --- a/vllm_ascend/sample/rejection_sampler.py +++ b/vllm_ascend/sample/rejection_sampler.py @@ -19,7 +19,6 @@ sample_recovered_tokens_kernel, ) from vllm_ascend.sample.sampler import apply_top_k_top_p -from vllm_ascend.utils import vllm_version_is def apply_sampling_constraints( @@ -167,10 +166,7 @@ def rejection_sample( return output_token_ids # Compute probability distribution from target logits. - if vllm_version_is("0.15.0"): - target_probs = target_logits - else: - target_probs = target_logits.softmax(dim=-1, dtype=torch.float32) + target_probs = target_logits.softmax(dim=-1, dtype=torch.float32) assert target_probs.is_contiguous() # Generate uniform probabilities for rejection sampling.