2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87
VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

- name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purposes, we actually need to do main-to-main matching.
ARG VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87
ARG VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: 4497431df654e46fb1fb5e64bf8611e762ae5d87
vllm: 8a680463fab3bc9e6760417cd5c0a6aa58283065
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
@@ -90,7 +90,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@
name: e2e-light
strategy:
matrix:
vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87]
vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | 4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
| main | 8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

## Release cadence

6 changes: 6 additions & 0 deletions tests/e2e/multicard/2-cards/test_disaggregated_encoder.py
@@ -32,6 +32,12 @@
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
async def test_models(model: str, tp_size: int) -> None:
from vllm_ascend.utils import vllm_version_is

if not vllm_version_is("0.17.0"):
pytest.skip(
"EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
)
encode_port = get_open_port()
pd_port = get_open_port()
vllm_server_args = [
6 changes: 6 additions & 0 deletions tests/e2e/multicard/2-cards/test_qwen3_moe.py
@@ -76,6 +76,12 @@ def test_qwen3_moe_distributed_aiv_tp2():

@pytest.mark.asyncio
async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
from vllm_ascend.utils import vllm_version_is

if not vllm_version_is("0.17.0"):
pytest.skip(
"EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
)
model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
port = get_open_port()
compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})
31 changes: 24 additions & 7 deletions vllm_ascend/ascend_config.py
@@ -181,30 +181,47 @@ def _construct_weight_prefetch_config(self, additional_config):
stacklevel=2,
)

@staticmethod
def _get_compile_ranges(compilation_config):
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.17.0"):
return compilation_config.compile_ranges_split_points
else:
return compilation_config.compile_ranges_endpoints

@staticmethod
def _set_compile_ranges(compilation_config, value):
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.17.0"):
compilation_config.compile_ranges_split_points = value
else:
compilation_config.compile_ranges_endpoints = value

def update_compile_ranges_split_points(self):
vllm_config = self.vllm_config
if self.ascend_compilation_config.enable_npugraph_ex:
if self.ascend_compilation_config.fuse_allreduce_rms:
from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD

new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points
new_compile_ranges_split_points = self._get_compile_ranges(vllm_config.compilation_config)
new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD)
new_compile_ranges_split_points = sorted(new_compile_ranges_split_points)
vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points
self._set_compile_ranges(vllm_config.compilation_config, new_compile_ranges_split_points)
logger.debug(
"set compile_ranges_split_points to "
"{new_compile_ranges_split_points} for matmul and allreduce fusion"
)

else:
new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points
new_compile_ranges_split_points = self._get_compile_ranges(vllm_config.compilation_config)
if vllm_config.additional_config.get("ascend_compilation_config", {}).get("fuse_allreduce_rms", True):
from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD

new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points
new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD)
new_compile_ranges_split_points = sorted(new_compile_ranges_split_points)
vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points
self._set_compile_ranges(vllm_config.compilation_config, new_compile_ranges_split_points)
logger.debug(
"set compile_ranges_split_points to "
"{new_compile_ranges_split_points} for matmul and allreduce fusion"
@@ -218,9 +235,9 @@ def update_compile_ranges_split_points(self):
sp_threshold = get_sp_threshold(vllm_config)
new_compile_ranges_split_points.append(sp_threshold)
logger.debug(f"add {sp_threshold} to compile_ranges_split_points for sequence parallelism")
if len(new_compile_ranges_split_points) > len(vllm_config.compilation_config.compile_ranges_split_points):
if len(new_compile_ranges_split_points) > len(self._get_compile_ranges(vllm_config.compilation_config)):
new_compile_ranges_split_points = sorted(new_compile_ranges_split_points)
vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points
self._set_compile_ranges(vllm_config.compilation_config, new_compile_ranges_split_points)


class FinegrainedTPConfig:
49 changes: 36 additions & 13 deletions vllm_ascend/kv_offload/npu.py
@@ -12,6 +12,7 @@
from vllm.v1.kv_offload.worker.worker import OffloadingHandler

from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler
from vllm_ascend.utils import vllm_version_is


class NPUOffloadingSpec(OffloadingSpec):
@@ -31,12 +32,23 @@ def __init__(self, vllm_config: VllmConfig, kv_cache_config: KVCacheConfig | Non

def get_manager(self) -> OffloadingManager:
if not self._manager:
kv_events_config = self.vllm_config.kv_events_config
enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
self._manager = LRUOffloadingManager(
CPUBackend(block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks),
enable_events=enable_events,
)
if vllm_version_is("0.17.0"):
kv_events_config = self.vllm_config.kv_events_config
enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
self._manager = LRUOffloadingManager(
CPUBackend(block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks),
enable_events=enable_events,
)
else:
kv_events_config = self.vllm_config.kv_events_config
enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
assert len(self.gpu_block_size) == 1
gpu_block_size = self.gpu_block_size[0]
offloaded_block_size = gpu_block_size * self.block_size_factor
self._manager = LRUOffloadingManager(
CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
enable_events=enable_events,
)
return self._manager

def get_handlers(
@@ -45,13 +57,24 @@ def get_handlers(
attn_backends: dict[str, type[AttentionBackend]],
) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
if not self._handler:
self._handler = CpuNpuOffloadingHandler(
attn_backends=attn_backends,
gpu_block_size=self.gpu_block_size,
cpu_block_size=self.offloaded_block_size,
num_cpu_blocks=self.num_cpu_blocks,
gpu_caches=kv_caches,
)
if vllm_version_is("0.17.0"):
self._handler = CpuNpuOffloadingHandler(
attn_backends=attn_backends,
gpu_block_size=self.gpu_block_size,
cpu_block_size=self.offloaded_block_size,
num_cpu_blocks=self.num_cpu_blocks,
gpu_caches=kv_caches,
)
else:
assert len(self.gpu_block_size) == 1
gpu_block_size = self.gpu_block_size[0]
self._handler = CpuNpuOffloadingHandler(
attn_backends=attn_backends,
gpu_block_size=gpu_block_size,
cpu_block_size=gpu_block_size * self.block_size_factor,
num_cpu_blocks=self.num_cpu_blocks,
gpu_caches=kv_caches,
)

assert self._handler is not None
yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
8 changes: 8 additions & 0 deletions vllm_ascend/patch/platform/patch_torch_accelerator.py
@@ -6,3 +6,11 @@ def patch_empty_cache() -> None:


torch.accelerator.empty_cache = patch_empty_cache

# Monkey-patch torch.accelerator memory APIs for NPU compatibility.
# Upstream vLLM (commit 747b068) replaced current_platform.memory_stats()
# with torch.accelerator.memory_stats(), but torch.accelerator does not
# properly delegate to NPU. We redirect to torch.npu.* equivalents.
torch.accelerator.memory_stats = torch.npu.memory_stats # type: ignore[attr-defined]
torch.accelerator.memory_reserved = torch.npu.memory_reserved # type: ignore[attr-defined]
torch.accelerator.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats # type: ignore[attr-defined]
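
A minimal usage sketch of the redirected entry points, assuming a working torch_npu installation, an available "npu" device, and that the patch module above has already been imported; the stats key shown is illustrative and may differ across torch_npu versions:

import torch
import torch_npu  # noqa: F401  # assumed installed; registers the NPU backend
# (assumes the monkey-patch above has already been applied, e.g. by the platform patch loader)

torch.accelerator.reset_peak_memory_stats()      # -> torch.npu.reset_peak_memory_stats()
x = torch.empty(1024, 1024, device="npu")        # allocate something on the NPU
stats = torch.accelerator.memory_stats()         # -> torch.npu.memory_stats()
reserved = torch.accelerator.memory_reserved()   # -> torch.npu.memory_reserved()
print(stats.get("allocated_bytes.all.current", 0), reserved)
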
43 changes: 32 additions & 11 deletions vllm_ascend/worker/model_runner_v1.py
@@ -292,16 +292,27 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
if self.use_sparse_c8_indexer:
self.c8_k_cache_dtype = torch.int8
self.c8_k_scale_cache_dtype = torch.float16

self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
use_mla=self.model_config.use_mla,
use_sparse=self.use_sparse,
use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
)
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.17.0"):
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
use_mla=self.model_config.use_mla,
use_sparse=self.use_sparse,
use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
)
else:
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
use_mla=self.model_config.use_mla,
use_sparse=self.use_sparse,
use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
)

try:
self.dcp_size = get_dcp_group().world_size
@@ -2553,7 +2564,17 @@ def load_model(self) -> None:
with get_tp_context(self.drafter):
self.drafter.load_model(self.model)
if self.use_aux_hidden_state_outputs:
self.model.set_aux_hidden_state_layers(self.model.get_eagle3_aux_hidden_state_layers())
if vllm_version_is("0.17.0"):
self.model.set_aux_hidden_state_layers(self.model.get_eagle3_aux_hidden_state_layers())
else:
from vllm.model_executor.models.interfaces import supports_eagle3
if not supports_eagle3(self.model):
raise RuntimeError(
"Model does not support EAGLE3 interface but "
"aux_hidden_state_outputs was requested"
)
aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers()
self.model.set_aux_hidden_state_layers(aux_layers)

if self.lora_config:
self.model = self.load_lora_model(self.model, self.vllm_config, self.device)
9 changes: 6 additions & 3 deletions vllm_ascend/xlite/xlite.py
@@ -92,9 +92,12 @@ def _build_model_config(self, vllm_config: VllmConfig) -> ModelConfig:

vision_config = getattr(vllm_config.model_config.hf_config, "vision_config", None)
rope_parameters = getattr(hf_config, "rope_parameters", {})
config.deepstack_num_level = len(getattr(vision_config, "deepstack_visual_indexes", []))
config.mrope_section = rope_parameters.get("mrope_section", [])
config.mrope_interleaved = rope_parameters.get("mrope_interleaved", False)
if hasattr(config, "deepstack_num_level"):
config.deepstack_num_level = len(getattr(vision_config, "deepstack_visual_indexes", []))
if hasattr(config, "mrope_section"):
config.mrope_section = rope_parameters.get("mrope_section", [])
if hasattr(config, "mrope_interleaved"):
config.mrope_interleaved = rope_parameters.get("mrope_interleaved", False)
return config

def _build_model(self, runnable: nn.Module, vllm_config: VllmConfig, config: ModelConfig) -> Model: