Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=8b6325758cce5f9c36d38f2462edbd368b97a07c
VLLM_COMMIT=6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

- name: Checkout repository
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purposes, we actually need a main-to-main match here.
ARG VLLM_COMMIT=8b6325758cce5f9c36d38f2462edbd368b97a07c
ARG VLLM_COMMIT=6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
vllm_version: [6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209, v0.18.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: 8b6325758cce5f9c36d38f2462edbd368b97a07c
vllm: 6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
Expand Down Expand Up @@ -90,7 +90,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
vllm_version: [6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209, v0.18.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand All @@ -102,7 +102,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0]
vllm_version: [6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209, v0.18.0]
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [8b6325758cce5f9c36d38f2462edbd368b97a07c]
vllm_version: [6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand Down
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | 8b6325758cce5f9c36d38f2462edbd368b97a07c, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
| main | 6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209, v0.18.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

## Release cadence

Expand Down
14 changes: 10 additions & 4 deletions tests/ut/distributed/device_communicators/test_pyhccl.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from tests.ut.base import TestBase
from vllm_ascend.distributed.device_communicators.pyhccl import \
PyHcclCommunicator
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, vllm_version_is


class MockHcclLib:
Expand Down Expand Up @@ -45,10 +46,15 @@ def test_load_hccl_fail(self):
@patch("vllm_ascend.utils.current_stream",
return_value=MagicMock(npu_stream=5678))
def test_stateless_group(self, *_):
group = StatelessProcessGroup(rank=3,
world_size=4,
store=None,
socket=None)
if vllm_version_is("0.18.0"):
group = StatelessProcessGroup(rank=3,
world_size=4,
store=None,
socket=None)
else:
group = StatelessProcessGroup(rank=3,
world_size=4,
store=None)

comm = PyHcclCommunicator(group=group, device=3)

Expand Down
4 changes: 3 additions & 1 deletion vllm_ascend/ascend_forward_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
is_drafter_moe_model,
is_moe_model,
speculative_enable_dispatch_gmm_combine_decode,
vllm_version_is,
)


Expand Down Expand Up @@ -53,13 +54,14 @@ def set_ascend_forward_context(
forward_context_kwargs = {
"attn_metadata": attn_metadata,
"vllm_config": vllm_config,
"virtual_engine": virtual_engine,
"num_tokens": num_tokens,
"num_tokens_across_dp": num_tokens_across_dp,
"cudagraph_runtime_mode": aclgraph_runtime_mode,
"batch_descriptor": batch_descriptor,
"skip_compiled": skip_compiled,
}
if vllm_version_is("0.18.0"):
forward_context_kwargs["virtual_engine"] = virtual_engine

with set_forward_context(**forward_context_kwargs):
forward_context = get_forward_context()
Expand Down
4 changes: 2 additions & 2 deletions vllm_ascend/ops/mla.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import _EXTRA_CTX
from vllm_ascend.utils import is_vl_model, parse_layer_idx
from vllm_ascend.utils import is_vl_model, parse_layer_idx, vllm_version_is


class IndexerWrapper(nn.Module):
Expand Down Expand Up @@ -183,7 +183,7 @@ def mla_forward(
attn_metadata = forward_context.attn_metadata[self.mla_attn.layer_name]
else:
attn_metadata = forward_context.attn_metadata
kv_cache = self.mla_attn.kv_cache[forward_context.virtual_engine]
kv_cache = self.mla_attn.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0]
self.mla_attn.impl.forward(
self.mla_attn.layer_name, hidden_states, kv_cache, attn_metadata, need_gather_q_kv, output
)
Expand Down
4 changes: 2 additions & 2 deletions vllm_ascend/patch/worker/patch_qwen3_5.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from vllm_ascend.attention.utils import maybe_save_kv_layer_to_connector
from vllm_ascend.ops.triton.fla.sigmoid_gating import fused_sigmoid_gating_delta_rule_update
from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch
from vllm_ascend.utils import enable_sp
from vllm_ascend.utils import enable_sp, vllm_version_is


class AscendQwen3_5GatedDeltaNet(Qwen3_5GatedDeltaNet):
Expand Down Expand Up @@ -66,7 +66,7 @@ def _forward_core(
non_spec_token_indx = attn_metadata.non_spec_token_indx
spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501
non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
self_kv_cache = self.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0]
conv_state = self_kv_cache[0].transpose(-1, -2)
ssm_state = self_kv_cache[1]
num_actual_tokens = attn_metadata.num_actual_tokens
Expand Down
4 changes: 2 additions & 2 deletions vllm_ascend/patch/worker/patch_qwen3_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from vllm_ascend.attention.utils import maybe_save_kv_layer_to_connector
from vllm_ascend.ops.triton.fla.fused_qkvzba_split_reshape import fused_qkvzba_split_reshape_cat
from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch
from vllm_ascend.utils import enable_sp
from vllm_ascend.utils import enable_sp, vllm_version_is


class AscendQwen3Next_GatedDeltaNet(Qwen3NextGatedDeltaNet):
Expand Down Expand Up @@ -124,7 +124,7 @@ def _forward_core(
non_spec_token_indx = attn_metadata.non_spec_token_indx
spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501
non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
self_kv_cache = self.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0]
conv_state = self_kv_cache[0].transpose(-1, -2)
ssm_state = self_kv_cache[1]
num_actual_tokens = attn_metadata.num_actual_tokens
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ def set_additional_forward_context(
attn_metadata: dict[str, Any],
vllm_config: VllmConfig,
dp_metadata,
virtual_engine: int = 0,
virtual_engine: int = 0, # TODO: Remove this parameter when upgrading from vLLM 0.18.0 to 0.19.0
num_tokens: int = 0,
num_tokens_across_dp: torch.Tensor | None = None,
cudagraph_runtime_mode=None,
Expand Down
Loading