2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -65,7 +65,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| Date | Event |
|------------|-------------------------------------------|
-| 2026.02.26 | Release candidates, v0.15.0rc1 |
+| 2026.02.27 | Release candidates, v0.15.0rc1 |
| 2026.02.06 | v0.13.0 Final release, v0.13.0 |
| 2026.01.26 | Release candidates, v0.14.0rc1 |
| 2026.01.24 | Release candidates, v0.13.0rc2 |
2 changes: 1 addition & 1 deletion docs/source/user_guide/release_notes.md
@@ -1,6 +1,6 @@
# Release Notes

-## v0.15.0rc1 - 2026.02.26
+## v0.15.0rc1 - 2026.02.27

This is the first release candidate of v0.15.0 for vLLM Ascend. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/latest) to get started.

2 changes: 1 addition & 1 deletion tests/ut/attention/test_sfa_v1.py
@@ -14,7 +14,7 @@
from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl,
AscendSFAMetadata,
AscendSFAMetadataBuilder)
-from vllm_ascend.utils import enable_dsa_cp, vllm_version_is
+from vllm_ascend.utils import enable_dsa_cp


class TestAscendSFABackend(TestBase):
2 changes: 1 addition & 1 deletion tests/ut/quantization/test_modelslim_config.py
@@ -13,7 +13,7 @@
MODELSLIM_CONFIG_FILENAME,
AscendModelSlimConfig,
)
-from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is
+from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD

from vllm.model_executor.layers.attention import Attention

5 changes: 2 additions & 3 deletions vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -153,11 +153,10 @@ def __init__(self, *args, **kwargs):
self.quant_type = self.get_quant_type()

_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)

if not vllm_version_is("0.15.0"):
if not vllm_version_is("0.16.0"):
self.runner = self._init_runner()

if not vllm_version_is("0.15.0"):
if not vllm_version_is("0.16.0"):

def _init_runner(self):
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
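For context, a minimal sketch of the `vllm_version_is` gating pattern this hunk retargets from 0.15.0 to 0.16.0. The helper below is a simplified stand-in (the real `vllm_ascend.utils.vllm_version_is` may treat dev and post releases differently), and `FusedMoE310` is a hypothetical stripped-down class, not the module's actual code:

```python
# Sketch of version-gated definitions, assuming a simplified version check.
from importlib.metadata import version


def vllm_version_is(target: str) -> bool:
    # Simplified stand-in for vllm_ascend.utils.vllm_version_is:
    # true when the installed vLLM release matches the target prefix.
    return version("vllm").startswith(target)


class FusedMoE310:  # hypothetical, for illustration only
    def __init__(self) -> None:
        if not vllm_version_is("0.16.0"):
            # Newer vLLM: build the MoE runner eagerly at construction time.
            self.runner = self._init_runner()

    # The method itself only exists on newer vLLM; the class-body check
    # runs once, when the class is defined.
    if not vllm_version_is("0.16.0"):

        def _init_runner(self):
            return object()  # placeholder for AscendMoERunner
```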
10 changes: 3 additions & 7 deletions vllm_ascend/compilation/passes/sequence_parallelism.py
@@ -1,18 +1,14 @@
import torch
import torch._inductor.pattern_matcher as pm
from torch._inductor.pattern_matcher import PatternMatcherPass

-from vllm_ascend.utils import is_moe_model, vllm_version_is
-
-if vllm_version_is("0.15.0"):
-    from vllm.compilation.vllm_inductor_pass import VllmInductorPass  # type: ignore
-else:
-    from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
+from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
from vllm.config import VllmConfig
from vllm.config.utils import Range
from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group, tensor_model_parallel_all_reduce
from vllm.logger import logger

+from vllm_ascend.utils import is_moe_model
+
SP_THRESHOLD = 1000


24 changes: 7 additions & 17 deletions vllm_ascend/ops/mm_encoder_attention.py
@@ -21,8 +21,6 @@
import torch_npu
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention # type: ignore

-from vllm_ascend.utils import vllm_version_is
-
MIN_PAD_SIZE: int = 64 # min_size to pad weight
MAX_PAD_SIZE: int = 128 # max_size to pad weight

@@ -64,9 +62,7 @@ def __init__(
prefix=prefix,
)

if not vllm_version_is("0.15.0"):
self.layer_index = int("".join(filter(str.isdigit, prefix)))

self.layer_index = int("".join(filter(str.isdigit, prefix)))
self.enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
self.scale_value = self.head_size**-0.5

@@ -106,19 +102,13 @@ def forward_oot(
kv_len = key.size(1)
is_reshaped = query.dim() == 4

if vllm_version_is("0.15.0"):
# Directly use seq_lens cpu cache to avoid d2h copy.
global seq_lens_cpu_cache
if self.layer_index == 0:
if cu_seqlens is None:
cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu")
seq_lens_cpu = torch.diff(cu_seqlens).to("cpu")
else:
global seq_lens_cpu_cache
if self.layer_index == 0:
if cu_seqlens is None:
cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu")
# Update seq_lens cpu cache.
seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu")
# Directly use seq_lens cpu cache to avoid d2h copy.
seq_lens_cpu = seq_lens_cpu_cache
# Update seq_lens cpu cache.
seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu")

# q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
q, k, v = self._reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)
@@ -138,7 +128,7 @@
query=q,
key=k,
value=v,
-            seq_len=seq_lens_cpu,
+            seq_len=seq_lens_cpu_cache,
scale_value=self.scale_value,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
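This hunk drops the 0.15.0 path and keeps only the cached one. As a rough sketch of the idea (the standalone helper and its name are illustrative, not the module's API): layer 0 computes the per-sequence lengths once on the CPU via `torch.diff` over `cu_seqlens`, and every later encoder layer reuses the cached tensor instead of paying a device-to-host copy per layer:

```python
import torch
from typing import Optional

seq_lens_cpu_cache: Optional[torch.Tensor] = None


def seq_lens_for_layer(layer_index: int, cu_seqlens: Optional[torch.Tensor],
                       bsz: int, q_len: int) -> torch.Tensor:
    """Illustrative helper: cache seq_lens on CPU at layer 0, reuse after."""
    global seq_lens_cpu_cache
    if layer_index == 0:
        if cu_seqlens is None:
            # Uniform lengths: boundaries [0, q_len, 2*q_len, ..., bsz*q_len].
            cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len,
                                      dtype=torch.int32, device="cpu")
        # Update the cache once per forward pass, avoiding a d2h copy per layer.
        seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu")
    assert seq_lens_cpu_cache is not None
    return seq_lens_cpu_cache


print(seq_lens_for_layer(0, None, bsz=2, q_len=4))  # tensor([4, 4], dtype=torch.int32)
print(seq_lens_for_layer(5, None, bsz=2, q_len=4))  # cache hit, no recompute
```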
6 changes: 1 addition & 5 deletions vllm_ascend/sample/rejection_sampler.py
@@ -19,7 +19,6 @@
sample_recovered_tokens_kernel,
)
from vllm_ascend.sample.sampler import apply_top_k_top_p
-from vllm_ascend.utils import vllm_version_is


def apply_sampling_constraints(
@@ -167,10 +166,7 @@ def rejection_sample(
return output_token_ids

# Compute probability distribution from target logits.
if vllm_version_is("0.15.0"):
target_probs = target_logits
else:
target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
assert target_probs.is_contiguous()

# Generate uniform probabilities for rejection sampling.
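After this change, `rejection_sample` always receives raw target logits and normalizes them itself. A minimal sketch of the acceptance step that consumes `target_probs` (the surrounding names `draft_probs` and `draft_token_ids` are assumptions for illustration, not the module's exact signature):

```python
import torch


def accept_draft_tokens(target_logits: torch.Tensor,    # [n, vocab]
                        draft_probs: torch.Tensor,      # [n, vocab]
                        draft_token_ids: torch.Tensor,  # [n]
                        ) -> torch.Tensor:
    # As in the diff: turn raw target logits into a probability distribution.
    target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
    # p: target probability of each drafted token; q: draft probability.
    p = target_probs.gather(1, draft_token_ids.unsqueeze(1)).squeeze(1)
    q = draft_probs.gather(1, draft_token_ids.unsqueeze(1)).squeeze(1)
    # Standard speculative-decoding rule: accept with probability min(1, p/q).
    u = torch.rand_like(p)
    return u <= (p / q).clamp(max=1.0)
```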