2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
@@ -34,7 +34,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
-VLLM_COMMIT=7157596103666ee7ccb7008acee8bff8a8ff1731
+VLLM_COMMIT=8be6432bdaf6275664d857b1e5e9bf8ed1ce299e
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

- name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -74,7 +74,7 @@ jobs:
name: e2e-full
strategy:
matrix:
-vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
+vllm_version: [8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -42,7 +42,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
-vllm: 7157596103666ee7ccb7008acee8bff8a8ff1731
+vllm: 8be6432bdaf6275664d857b1e5e9bf8ed1ce299e
changes:
runs-on: linux-aarch64-a2-0
outputs:
@@ -90,7 +90,7 @@ jobs:
SOC_VERSION: ascend910b1
strategy:
matrix:
-vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
+vllm_version: [8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0]

steps:
- name: Free up disk space
@@ -163,7 +163,7 @@ jobs:
name: e2e-light
strategy:
matrix:
-vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
+vllm_version: [8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -51,7 +51,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
-| main | 7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
+| main | 8be6432bdaf6275664d857b1e5e9bf8ed1ce299e, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

## Release cadence

3 changes: 2 additions & 1 deletion tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -28,7 +28,8 @@
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

MODELS = [
"Qwen/Qwen3-0.6B",
# Offline data parallel mode will be not supported/useful for dense models
# "Qwen/Qwen3-0.6B",
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
]

4 changes: 1 addition & 3 deletions tests/e2e/multicard/test_data_parallel.py
@@ -27,9 +27,7 @@

import pytest

-MODELS = [
-    "Qwen/Qwen3-0.6B", "Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"
-]
+MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]


@pytest.mark.parametrize("model", MODELS)
2 changes: 1 addition & 1 deletion tests/e2e/multicard/test_data_parallel_tp2.py
@@ -9,7 +9,7 @@

import pytest

MODELS = ["Qwen/Qwen3-0.6B"]
MODELS = ["Qwen/Qwen3-30B-A3B"]


@pytest.mark.parametrize("model", MODELS)
6 changes: 5 additions & 1 deletion tests/ut/attention/test_mla_v1.py
@@ -17,6 +17,7 @@
AscendMLAPrefillMetadata,
ChunkedContextMetadata)
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.utils import vllm_version_is


class TestAscendMLABackend(TestBase):
@@ -392,7 +393,10 @@ def setUp(self):
self.mock_vllm_config.model_config = model_config
self.kv_cache_spec = MagicMock()
self.kv_cache_spec.num_layers = 32
-self.kv_cache_spec.head_size = 128
+if vllm_version_is('0.13.0'):
+    self.kv_cache_spec.head_size = 128
+else:
+    self.kv_cache_spec.head_size = 64
self.kv_cache_spec.num_heads = 32

@patch("vllm_ascend.attention.mla_v1.get_cos_and_sin_mla")
14 changes: 7 additions & 7 deletions tests/ut/conftest.py
@@ -18,20 +18,20 @@
import sys
from unittest.mock import MagicMock

-from vllm_ascend.utils import adapt_patch  # noqa E402
-from vllm_ascend.utils import register_ascend_customop
-
-# triton and torch_npu is not available in the environment, so we need to mock them
-sys.modules['torch_npu'].npu.current_device = MagicMock(return_value=0)
-sys.modules['torch_npu._inductor'] = MagicMock()

triton_runtime = MagicMock()
triton_runtime.driver.active.utils.get_device_properties.return_value = {
'num_aic': 8,
'num_vectorcore': 8,
}
sys.modules['triton.runtime'] = triton_runtime

+from vllm_ascend.utils import adapt_patch  # noqa E402
+from vllm_ascend.utils import register_ascend_customop  # noqa E402
+
+# triton and torch_npu is not available in the environment, so we need to mock them
+sys.modules['torch_npu'].npu.current_device = MagicMock(return_value=0)
+sys.modules['torch_npu._inductor'] = MagicMock()

adapt_patch()
adapt_patch(True)
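
For context, a standalone sketch (not the fixture itself) of why the import order matters here: sys.modules is Python's import cache, so stubs registered there are what any later direct or transitive import of the same names resolves to. The parent triton entry is stubbed explicitly so the sketch runs on its own; in the real conftest it is presumably handled earlier in the file.

    import sys
    from unittest.mock import MagicMock

    # Register the stubs first; importing vllm_ascend.utils before this point
    # would try to load the real triton/torch_npu at import time and fail on
    # hosts where they are not installed.
    sys.modules['triton'] = MagicMock()
    triton_runtime = MagicMock()
    triton_runtime.driver.active.utils.get_device_properties.return_value = {
        'num_aic': 8,
        'num_vectorcore': 8,
    }
    sys.modules['triton.runtime'] = triton_runtime

    # Only now is it safe to import modules that touch triton on import, e.g.:
    # from vllm_ascend.utils import adapt_patch, register_ascend_customop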

1 change: 0 additions & 1 deletion tests/ut/spec_decode/test_eagle_proposer.py
@@ -58,7 +58,6 @@ def test_initialization_eagle_graph(self):
device=self.device,
runner=self.runner)

-self.assertEqual(proposer.block_size, 16)
self.assertEqual(proposer.hidden_size, 4096)
self.assertTrue(proposer.use_cuda_graph)

1 change: 0 additions & 1 deletion tests/ut/spec_decode/test_mtp_proposer.py
@@ -86,7 +86,6 @@ def test_init(self, mock_cpu_gpu_buffer, vllm_config, runner):
assert proposer.dtype == torch.float16
assert proposer.num_speculative_tokens == 2
assert proposer.hidden_size == 4096
-assert proposer.block_size == 16

# Test with mrope enabled
assert hasattr(proposer, "positions")
1 change: 1 addition & 0 deletions vllm_ascend/attention/attention_v1.py
@@ -197,6 +197,7 @@ def __init__(
vllm_config: VllmConfig,
device: torch.device,
):
+super().__init__(kv_cache_spec, layer_names, vllm_config, device)
self.vllm_config = vllm_config
self.model_config = vllm_config.model_config
self.compilation_config = vllm_config.compilation_config
14 changes: 10 additions & 4 deletions vllm_ascend/spec_decode/eagle_proposer.py
@@ -136,6 +136,7 @@ def load_model(self, model: nn.Module) -> None:
draft_attn_layer_names = draft_attn_layer_names - draft_indexer_layer_names
assert len(draft_attn_layer_names) == 1
self.attn_layer_name = list(draft_attn_layer_names)
+self.attn_layer_names = self.attn_layer_name

# share embed_tokens with the target model if needed
if get_pp_group().world_size == 1:
@@ -442,14 +443,19 @@ def _propose(
# For the requests that exceed the max model length, we set the
# TODO: sequence length to 1 to minimize their overheads in attention.

+if self.attn_metadata_builder is None:
+    attn_metadata_builder = self._get_attention_metadata_builder()
+else:
+    attn_metadata_builder = self.attn_metadata_builder
+block_size = attn_metadata_builder.kv_cache_spec.block_size

# Compute the slot mapping.
-block_numbers = (clamped_positions // self.block_size)
+block_numbers = (clamped_positions // block_size)
block_ids = attn_metadata.block_tables.gather(
dim=1, index=block_numbers.view(-1, 1))
block_ids = block_ids.view(-1)
-slot_mapping_tmp = (
-    block_ids * self.vllm_config.cache_config.block_size +
-    clamped_positions % self.block_size)
+slot_mapping_tmp = (block_ids * block_size +
+                    clamped_positions % block_size)

# Mask out the slot mappings that exceed the max model length.
# Otherwise, the KV cache will be inadvertently updated with the
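
A small, self-contained sketch of the slot-mapping arithmetic in the rewritten hunk, with block_size now read from the metadata builder's kv_cache_spec instead of a separate self.block_size attribute. The tensors, shapes, and values below are illustrative only.

    import torch

    block_size = 16                                    # kv_cache_spec.block_size
    clamped_positions = torch.tensor([0, 5, 17, 40])   # one draft token per request
    block_tables = torch.tensor([[3, 7, 9],            # per-request KV block ids
                                 [2, 4, 8],
                                 [1, 6, 5],
                                 [0, 3, 2]])

    block_numbers = clamped_positions // block_size    # logical block per token
    block_ids = block_tables.gather(dim=1, index=block_numbers.view(-1, 1)).view(-1)
    slot_mapping = block_ids * block_size + clamped_positions % block_size
    # slot_mapping[i] is the flat KV-cache slot token i writes into:
    # tensor([48, 37, 97, 40]) for the values above.
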
36 changes: 25 additions & 11 deletions vllm_ascend/worker/model_runner_v1.py
@@ -107,7 +107,7 @@
from vllm_ascend.utils import (AscendDeviceType, ProfileExecuteDuration,
enable_sp, get_ascend_device_type, is_moe_model,
lmhead_tp_enable, maybe_trans_nz,
-set_weight_prefetch_method)
+set_weight_prefetch_method, vllm_version_is)
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
from vllm_ascend.worker.pcp_utils import PCPManager

@@ -1097,12 +1097,20 @@ def _generate_process_reqs_hidden_states(self, maybe_padded_num_tokens,
intermediate_tensors,
inputs_embeds):
assert self.model is not None
-hidden_states = self.model(
-    input_ids=input_ids,
-    positions=positions,
-    intermediate_tensors=intermediate_tensors,
-    inputs_embeds=inputs_embeds,
-    **self._init_model_kwargs(maybe_padded_num_tokens))
+if vllm_version_is('0.13.0'):
+    hidden_states = self.model(
+        input_ids=input_ids,
+        positions=positions,
+        intermediate_tensors=intermediate_tensors,
+        inputs_embeds=inputs_embeds,
+        **self._init_model_kwargs(maybe_padded_num_tokens))
+else:
+    hidden_states = self.model(
+        input_ids=input_ids,
+        positions=positions,
+        intermediate_tensors=intermediate_tensors,
+        inputs_embeds=inputs_embeds,
+        **self._init_model_kwargs())

forward_context = get_forward_context()
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL \
@@ -1548,10 +1556,16 @@ def execute_model(
logits = None
else:
if self.input_batch.pooling_params:
-pool_output = self._pool(
-    hidden_states,
-    scheduler_output.total_num_scheduled_tokens,
-    num_scheduled_tokens_np)
+if vllm_version_is('0.13.0'):
+    pool_output = self._pool(
+        hidden_states,
+        scheduler_output.total_num_scheduled_tokens,
+        num_scheduled_tokens_np)
+else:
+    pool_output = self._pool(
+        hidden_states,
+        scheduler_output.total_num_scheduled_tokens,
+        num_scheduled_tokens_np, kv_connector_output)
if self.debugger is not None:
self.debugger.stop()
self.debugger.step()
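
Both hunks in this file follow the same pattern: branch on vllm_version_is('0.13.0') wherever an upstream call signature differs between the pinned vLLM commit and the v0.13.0 tag. A hedged sketch of that pattern with simplified stand-in helpers (the real check lives in vllm_ascend.utils, and the real _init_model_kwargs builds kwargs from runner state):

    def vllm_version_is(version: str) -> bool:
        # Stand-in for vllm_ascend.utils.vllm_version_is, which compares
        # against the installed vLLM package version; hard-coded here.
        return version == "0.13.0"

    def _init_model_kwargs(num_tokens: int | None = None) -> dict:
        # Simplified placeholder for the runner helper.
        return {} if num_tokens is None else {"num_tokens": num_tokens}

    def generate_hidden_states(model, input_ids, positions, padded_num_tokens):
        if vllm_version_is("0.13.0"):
            # v0.13.0 still expects the padded token count to be passed through.
            kwargs = _init_model_kwargs(padded_num_tokens)
        else:
            # The newer signature no longer takes that argument.
            kwargs = _init_model_kwargs()
        return model(input_ids=input_ids, positions=positions, **kwargs)
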
5 changes: 3 additions & 2 deletions vllm_ascend/worker/worker.py
@@ -294,7 +294,7 @@ def determine_available_memory(self) -> int:
def execute_model(
self,
scheduler_output: "SchedulerOutput",
-) -> ModelRunnerOutput | None:
+) -> ModelRunnerOutput | AsyncModelRunnerOutput | None:
# enable msMonitor to monitor the performance of vllm-ascend
if envs_ascend.MSMONITOR_USE_DAEMON:
dp.step()
@@ -313,7 +313,8 @@ def execute_model(

output = self.model_runner.execute_model(scheduler_output,
intermediate_tensors)
-if isinstance(output, (ModelRunnerOutput, NoneType)):
+if isinstance(output,
+              (ModelRunnerOutput, AsyncModelRunnerOutput, NoneType)):
return output

assert isinstance(output, IntermediateTensors)
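
For reference, a sketch of the dispatch that the widened isinstance check implements: finished runner outputs (synchronous, asynchronous, or nothing scheduled) go straight back to the engine, while IntermediateTensors are forwarded to the next pipeline-parallel rank. The classes and the send helper below are placeholders, not vLLM's real definitions.

    from types import NoneType


    class ModelRunnerOutput: ...
    class AsyncModelRunnerOutput: ...
    class IntermediateTensors: ...


    def send_to_next_pp_rank(tensors) -> None:
        """Placeholder for the pipeline-parallel send."""


    def handle_runner_output(output):
        if isinstance(output,
                      (ModelRunnerOutput, AsyncModelRunnerOutput, NoneType)):
            return output                     # hand back to the engine as-is
        assert isinstance(output, IntermediateTensors)
        send_to_next_pp_rank(output)          # not the last PP stage: pass along
        return None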