2 changes: 1 addition & 1 deletion .github/workflows/_e2e_test.yaml
@@ -18,7 +18,7 @@ on:
       continue_on_error:
         required: false
         type: boolean
-        default: false
+        default: true
 env:
   UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
   UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029
+          VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

       - name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029
+ARG VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 35141a7eeda941a60ad5a4956670c60fd5a77029
+      vllm: 14acf429ac08b6d538ca6feb3e06b6d13895804d
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
2 changes: 1 addition & 1 deletion vllm_ascend/ascend_config.py
@@ -129,7 +129,7 @@ def __init__(self, vllm_config: "VllmConfig"):
         # when enable_async_exponential is True, AscendSampler will be different from vllm Sampler,
         # which make batch_invariant mode not working.
         # so we disable async exponential when batch_invariant mode is enabled.
-        from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+        from vllm_ascend.batch_invariant import vllm_is_batch_invariant

         self.enable_async_exponential = (
             bool(additional_config.get("enable_async_exponential", False)) and not vllm_is_batch_invariant()
30 changes: 27 additions & 3 deletions vllm_ascend/attention/attention_v1.py
@@ -688,7 +688,20 @@ def full_graph_pa(
             graph_params.handles[num_tokens].append(handle)
         return output

-    def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: AscendMetadata):
+    def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: AscendMetadata, kv_cache=None):
+        # PrefillNoCache doesn't need key_cache, but other modes do.
+        # Only initialize/require the cache for modes that actually use it.
+        if attn_metadata.attn_state != AscendAttentionState.PrefillNoCache:
+            # Initialize the cache from kv_cache if not already set (for DecodeOnly mode).
+            if self.key_cache is None and kv_cache is not None:
+                if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2:
+                    self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+                elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2:
+                    self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+
+            if self.key_cache is None:
+                raise RuntimeError(f"key_cache is None in _get_fia_params for mode {attn_metadata.attn_state}. kv_cache={kv_cache}")
+
         if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
             block_size = 128
             block_table = None
@@ -766,6 +779,7 @@ def forward_fused_infer_attention(
         value: torch.Tensor,
         attn_metadata: AscendMetadata,
         output: torch.Tensor,
+        kv_cache=None,
     ):
         # we inherit ForwardContext in model runner v2, when enable model
         # runner v2, there is not capturing attribute in forward_context,
@@ -781,7 +795,7 @@ def forward_fused_infer_attention(
             and self.sinks is None
         ):
             return self._forward_fia_slidingwindow(query, attn_metadata, output)
-        key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata)
+        key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata, kv_cache)
         num_tokens = attn_metadata.actual_seq_lengths_q[-1]
         query = query[:num_tokens]
         if (
@@ -927,7 +941,7 @@ def forward_impl(
         ):
             output = self.forward_paged_attention(query, attn_metadata, output)
         else:
-            output = self.forward_fused_infer_attention(query, key, value, attn_metadata, output)
+            output = self.forward_fused_infer_attention(query, key, value, attn_metadata, output, kv_cache)

         return output

@@ -963,6 +977,16 @@ def forward(
         num_tokens = query.shape[0]
         if attn_metadata is None:
             return output.fill_(0)
+
+        # Initialize key_cache and value_cache from kv_cache if not already set.
+        # This is needed for DecodeOnly mode where key/value are None but we still
+        # need access to the cache for attention computation.
+        if self.key_cache is None and kv_cache is not None:
+            if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2:
+                self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+            elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2:
+                self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+
         output_padded = None
         if key is not None and value is not None:
             output_padded = output
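
The kv_cache unpacking added here now appears verbatim in both _get_fia_params and forward. A small helper inside attention_v1.py (which already imports torch) could remove the duplication; a minimal sketch under the PR's own assumptions about kv_cache layout (the name _split_kv_cache is hypothetical, not part of this change):

def _split_kv_cache(kv_cache):
    # Assumption carried over from the PR: kv_cache is either a stacked
    # tensor of shape [2, ...] or a (key_cache, value_cache) pair;
    # anything else yields (None, None).
    if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2:
        return kv_cache[0], kv_cache[1]
    if isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2:
        return kv_cache[0], kv_cache[1]
    return None, None

Both call sites would then reduce to self.key_cache, self.value_cache = _split_kv_cache(kv_cache) under the existing self.key_cache is None guard.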
16 changes: 15 additions & 1 deletion vllm_ascend/batch_invariant.py
@@ -20,14 +20,28 @@

+import os  # needed for the os.getenv fallback below (add only if not already imported above)
 import torch
 import torch_npu
+import vllm.envs as envs
 from vllm.logger import logger
-from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
 from vllm.triton_utils import HAS_TRITON

 # in case recursive call in reduce_sum.
 torch_sum = torch.sum


+def vllm_is_batch_invariant() -> bool:
+    """Check if batch-invariant mode is enabled.
+
+    This is a compatibility wrapper for the vLLM function that was removed
+    in recent upstream vLLM refactoring.
+    """
+    # Try to read the flag from vLLM's envs module; fall back to the raw
+    # environment variable.
+    if hasattr(envs, 'VLLM_BATCH_INVARIANT'):
+        return bool(envs.VLLM_BATCH_INVARIANT)
+    else:
+        # Fallback for older vLLM versions that don't define this env entry.
+        return bool(int(os.getenv("VLLM_BATCH_INVARIANT", "0")))
+
+
 if HAS_TRITON:
     from vllm_ascend.ops.triton.batch_invariant.matmul import (
         addmm_batch_invariant,
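
A quick way to exercise the new wrapper (a sketch; it assumes a fresh process so vLLM's lazy envs lookup re-reads the variable, and it relies on the os import noted above):

import os

os.environ["VLLM_BATCH_INVARIANT"] = "1"

from vllm_ascend.batch_invariant import vllm_is_batch_invariant

# Expected to report True via either branch: vllm.envs when it still
# defines VLLM_BATCH_INVARIANT, otherwise the raw os.getenv fallback.
print(vllm_is_batch_invariant())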
18 changes: 14 additions & 4 deletions vllm_ascend/kv_offload/npu.py
@@ -5,12 +5,21 @@
 from vllm.v1.attention.backend import AttentionBackend  # type: ignore
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
-from vllm.v1.kv_offload.backends.cpu import CPUBackend
-from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.spec import OffloadingSpec
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler

+# Handle import compatibility with different vLLM versions.
+try:
+    from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager
+except ModuleNotFoundError:
+    # Fallback for older vLLM versions where the path might be different.
+    try:
+        from vllm.v1.kv_offload.cpu_manager import CPUOffloadingManager  # noqa: F401
+    except ModuleNotFoundError:
+        # If still not found, set it to None; the failure then surfaces at
+        # usage time in get_manager rather than at import time.
+        CPUOffloadingManager = None  # type: ignore
+
 from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler


@@ -36,8 +45,9 @@ def get_manager(self) -> OffloadingManager:
         assert len(self.gpu_block_size) == 1
         gpu_block_size = self.gpu_block_size[0]
         offloaded_block_size = gpu_block_size * self.block_size_factor
-        self._manager = LRUOffloadingManager(
-            CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
+        self._manager = CPUOffloadingManager(
+            block_size=offloaded_block_size,
+            num_blocks=self.num_cpu_blocks,
             enable_events=enable_events,
         )
         return self._manager
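
Since the double fallback can leave CPUOffloadingManager as None, calling it in get_manager would otherwise fail with an opaque TypeError. A guard along these lines (a sketch, not part of the PR) would surface a clearer error:

def get_manager(self) -> OffloadingManager:
    if CPUOffloadingManager is None:
        # Neither import path matched the installed vLLM build.
        raise ImportError(
            "CPUOffloadingManager is unavailable in the installed vLLM; "
            "CPU KV offloading requires a vLLM version that provides it."
        )
    # ... existing body unchanged ...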
5 changes: 4 additions & 1 deletion vllm_ascend/ops/mla.py
@@ -183,7 +183,10 @@ def mla_forward(
             attn_metadata = forward_context.attn_metadata[self.mla_attn.layer_name]
         else:
             attn_metadata = forward_context.attn_metadata
-        kv_cache = self.mla_attn.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0]
+        if vllm_version_is("0.18.0"):
+            kv_cache = self.mla_attn.kv_cache[forward_context.virtual_engine]
+        else:
+            kv_cache = self.mla_attn.kv_cache
         self.mla_attn.impl.forward(
             self.mla_attn.layer_name, hidden_states, kv_cache, attn_metadata, need_gather_q_kv, output
         )
5 changes: 4 additions & 1 deletion vllm_ascend/patch/worker/patch_qwen3_5.py
@@ -135,7 +135,10 @@ def _forward_core(
         non_spec_token_indx = attn_metadata.non_spec_token_indx
         spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor  # noqa: E501
         non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor  # noqa: E501
-        self_kv_cache = self.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0]
+        if vllm_version_is("0.18.0"):
+            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+        else:
+            self_kv_cache = self.kv_cache
         conv_state = self_kv_cache[0].transpose(-1, -2)
         ssm_state = self_kv_cache[1]
         num_actual_tokens = attn_metadata.num_actual_tokens
5 changes: 4 additions & 1 deletion vllm_ascend/patch/worker/patch_qwen3_next.py
@@ -125,7 +125,10 @@ def _forward_core(
         non_spec_token_indx = attn_metadata.non_spec_token_indx
         spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor  # noqa: E501
         non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor  # noqa: E501
-        self_kv_cache = self.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0]
+        if vllm_version_is("0.18.0"):
+            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+        else:
+            self_kv_cache = self.kv_cache
         conv_state = self_kv_cache[0].transpose(-1, -2)
         ssm_state = self_kv_cache[1]
         num_actual_tokens = attn_metadata.num_actual_tokens
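
This is the third copy of the same version gate (after mla.py and patch_qwen3_5.py): vLLM 0.18.0 still wraps each layer's kv_cache in a per-virtual-engine list, while newer vLLM binds the cache directly (see the bind_kv_cache change below). A shared helper could collapse the three sites; a sketch, with the hypothetical name get_layer_kv_cache and assuming vllm_version_is lives in vllm_ascend.utils:

from vllm_ascend.utils import vllm_version_is  # assumed import location


def get_layer_kv_cache(module, forward_context):
    # vLLM 0.18.0: kv_cache is a list indexed by virtual engine.
    if vllm_version_is("0.18.0"):
        return module.kv_cache[forward_context.virtual_engine]
    # Newer vLLM: the cache tensor is bound directly on the module.
    return module.kv_cache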
3 changes: 1 addition & 2 deletions vllm_ascend/patch/worker/patch_qwen3_next_mtp.py
@@ -44,8 +44,7 @@ def bind_kv_cache(

     # Bind kv_caches to forward context
     for layer_name, kv_cache in kv_caches.items():
-        # NOTE: Use list because of v0 PP virtual engine.
-        forward_context[layer_name].kv_cache = [kv_cache]
+        forward_context[layer_name].kv_cache = kv_cache


 utils.bind_kv_cache = bind_kv_cache
2 changes: 1 addition & 1 deletion vllm_ascend/sample/sampler.py
@@ -1,5 +1,5 @@
 import torch
-from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+from vllm_ascend.batch_invariant import vllm_is_batch_invariant
 from vllm.triton_utils import HAS_TRITON
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
3 changes: 3 additions & 0 deletions vllm_ascend/spec_decode/eagle_proposer.py
@@ -91,6 +91,9 @@ class SpecDecodeBaseProposer(EagleProposer):
     def __init__(self, vllm_config: VllmConfig, device: torch.device, pass_hidden_states_to_model: bool, runner=None):
         super().__init__(vllm_config, device, runner)

+        # Assign runner before it's used in the methods below.
+        self.runner = runner
+
         self.use_async_scheduling = self.vllm_config.scheduler_config.async_scheduling
         self.pass_hidden_states_to_model = pass_hidden_states_to_model
         self.decode_threshold = 1 + self.num_speculative_tokens
2 changes: 1 addition & 1 deletion vllm_ascend/utils.py
@@ -259,7 +259,7 @@ def enable_custom_op():
     Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component.
     Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device().
     """
-    from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+    from vllm_ascend.batch_invariant import vllm_is_batch_invariant

     global _CUSTOM_OP_ENABLED