Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/accuracy_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ on:
# Current supported vLLM versions
options:
- main
- v0.9.2
- v0.10.0
- v0.9.1
- v0.7.3
vllm-ascend-version:
Expand Down Expand Up @@ -163,7 +163,7 @@ jobs:
repository: vllm-project/vllm
path: ./vllm-empty
# Please also update this when bump matched version
ref: ${{ github.event.inputs.vllm-version || 'v0.9.2' }}
ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }}

- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nightly_benchmarks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:
strategy:
matrix:
include:
- vllm_branch: v0.9.2
- vllm_branch: v0.10.0
vllm_ascend_branch: main
vllm_use_v1: 1
max-parallel: 1
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/vllm_ascend_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ jobs:
VLLM_USE_MODELSCOPE: True
strategy:
matrix:
vllm_version: [main, v0.9.2]
vllm_version: [main, v0.10.0]
steps:
- name: Install packages
run: |
Expand Down Expand Up @@ -137,7 +137,7 @@ jobs:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1]
vllm_version: [main, v0.9.2]
vllm_version: [main, v0.10.0]
name: singlecard e2e test
runs-on: ${{ matrix.os }}
container:
Expand Down Expand Up @@ -216,7 +216,7 @@ jobs:
max-parallel: 1
matrix:
os: [linux-arm64-npu-4]
vllm_version: [main, v0.9.2]
vllm_version: [main, v0.10.0]
name: multicard e2e test
runs-on: ${{ matrix.os }}
container:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_long_term.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4]
vllm_version: [main, v0.9.2]
vllm_version: [main, v0.10.0]
name: vLLM Ascend long term test
runs-on: ${{ matrix.os }}
container:
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.9.2
ARG VLLM_TAG=v0.10.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.310p
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.9.2
ARG VLLM_TAG=v0.10.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.310p.openEuler
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.9.2
ARG VLLM_TAG=v0.10.0

RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.a3
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.9.2
ARG VLLM_TAG=v0.10.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.a3.openEuler
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.9.2
ARG VLLM_TAG=v0.10.0

RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.openEuler
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.9.2
ARG VLLM_TAG=v0.10.0

RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
# CANN image tag
'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10",
# vllm version in ci
'ci_vllm_version': 'v0.9.2',
'ci_vllm_version': 'v0.10.0',
}

# Add any paths that contain templates here, relative to this directory.
Expand Down
8 changes: 4 additions & 4 deletions docs/source/developer_guide/feature_guide/patch.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ vllm_ascend

In both the **platform** and **worker** folders, there are several patch modules. They are used for patching different versions of vLLM.

- `patch_0_9_2`: This module is used for patching vLLM 0.9.2. The version is always the nearest version of vLLM. Once vLLM is released, we will drop this patch module and bump to a new version. For example, `patch_0_9_2` is used for patching vLLM 0.9.2.
- `patch_0_10_0`: This module is used for patching vLLM 0.10.0. The version is always the nearest released version of vLLM. Once a new version of vLLM is released, we will drop this patch module and bump to the new version. For example, `patch_0_10_0` is used for patching vLLM 0.10.0.
- `patch_main`: This module is used for patching the code in vLLM main branch.
- `patch_common`: This module is used for patching both vLLM 0.9.2 and vLLM main branch.
- `patch_common`: This module is used for patching both vLLM 0.10.0 and vLLM main branch.

## How to write a patch

Before writing a patch, keep the principle above in mind: patch as little code as possible. If necessary, we can patch the code in either the **platform** or the **worker** folder. Here is an example of patching the `distributed` module in vLLM.

1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.9.2 and main of vLLM.
1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.10.0 and main of vLLM.
2. Decide which process we should patch. For example, here `distributed` belongs to the vLLM main process, so we should patch `platform`.
3. Create the patch file in the right folder. The file should be named as `patch_{module_name}.py`. The example here is `vllm_ascend/patch/platform/patch_common/patch_distributed.py`.
4. Write your patch code in the new file. Here is an example:
Expand Down Expand Up @@ -82,4 +82,4 @@ Before writing a patch, following the principle above, we should patch the least

## Limitation
1. In the V1 Engine, vLLM starts three kinds of processes: the Main process, the EngineCore process and the Worker process. Currently vLLM Ascend only supports patching the code in the Main process and the Worker process by default. If you want to patch code that runs in the EngineCore process, you should patch the EngineCore process entirely during setup; the entry code is in `vllm.v1.engine.core`. Please override `EngineCoreProc` and `DPEngineCoreProc` entirely.
2. If you are running an edited vLLM code, the version of the vLLM may be changed automatically. For example, if you runs an edited vLLM based on v0.9.n, the version of vLLM may be change to v0.9.nxxx, in this case, the patch for v0.9.n in vLLM Ascend would not work as expect, because that vLLM Ascend can't distinguish the version of vLLM you're using. In this case, you can set the environment variable `VLLM_VERSION` to specify the version of vLLM you're using, then the patch for v0.9.2 should work.
2. If you are running edited vLLM code, the version of vLLM may be changed automatically. For example, if you run an edited vLLM based on v0.9.n, the version of vLLM may be changed to v0.9.nxxx. In this case, the patch for v0.9.n in vLLM Ascend would not work as expected, because vLLM Ascend can't distinguish the version of vLLM you're using. You can set the environment variable `VLLM_VERSION` to specify the version of vLLM you're using; then the patch for v0.10.0 should work.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ requires = [
"msgpack",
"quart",
"numba",
# Remove after https://github.com/vllm-project/vllm-ascend/issues/2034
"transformers<4.54.0",
]
build-backend = "setuptools.build_meta"

Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ setuptools-scm>=8
torch>=2.5.1
torchvision<0.21.0
wheel
# Remove after https://github.com/vllm-project/vllm-ascend/issues/2034
transformers<4.54.0

# requirements for disaggregated prefill
msgpack
Expand Down
16 changes: 16 additions & 0 deletions tests/e2e/singlecard/test_offline_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,19 @@ def test_models_topk() -> None:
enforce_eager=True,
gpu_memory_utilization=0.7) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)


def test_models_prompt_logprobs() -> None:
    """Smoke-test greedy generation with logprobs requested.

    A single short prompt is run through ``generate_greedy_logprobs`` on
    the Qwen2.5-0.5B-Instruct model in eager mode. The returned value is
    not inspected: the test passes when the runner can be entered and the
    generation call completes without raising.
    """
    prompts = ["Hello, my name is"]

    runner = VllmRunner(
        "Qwen/Qwen2.5-0.5B-Instruct",
        max_model_len=8192,
        dtype="float16",
        enforce_eager=True,
        gpu_memory_utilization=0.7,
    )
    with runner as llm:
        # Ask for 5 new tokens and the top-1 logprob per generated token.
        llm.generate_greedy_logprobs(prompts, max_tokens=5, num_logprobs=1)
11 changes: 2 additions & 9 deletions tests/ut/attention/test_attention_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,12 @@
import torch

from tests.ut.base import TestBase
from vllm_ascend.attention.attention_v1 import \
AscendAttentionBackendImpl092 # isort: skip
from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
AscendAttentionBackendImpl,
AscendAttentionMetadataBuilder,
AscendAttentionState,
AscendMetadata,
CommonAttentionState)
from vllm_ascend.utils import vllm_version_is


class TestAscendAttentionBackend(TestBase):
Expand All @@ -20,12 +17,8 @@ def test_get_name(self):
self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")

def test_get_impl_cls(self):
if vllm_version_is("0.9.2"):
self.assertEqual(AscendAttentionBackend.get_impl_cls(),
AscendAttentionBackendImpl092)
else:
self.assertEqual(AscendAttentionBackend.get_impl_cls(),
AscendAttentionBackendImpl)
self.assertEqual(AscendAttentionBackend.get_impl_cls(),
AscendAttentionBackendImpl)

def test_get_metadata_cls(self):
self.assertEqual(AscendAttentionBackend.get_metadata_cls(),
Expand Down
38 changes: 2 additions & 36 deletions vllm_ascend/attention/attention_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple, Type
from typing import List, Optional, Tuple, Type

import torch
import torch_npu
Expand All @@ -31,7 +31,7 @@

from vllm_ascend.ops.attention import vanilla_chunked_prefill
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
nd_to_nz_2d, nd_to_nz_spec)


class AscendAttentionBackend(AttentionBackend):
Expand All @@ -43,8 +43,6 @@ def get_name() -> str:

@staticmethod
def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
if vllm_version_is("0.9.2"):
return AscendAttentionBackendImpl092
return AscendAttentionBackendImpl

@staticmethod
Expand Down Expand Up @@ -440,38 +438,6 @@ def forward(
return output.view(num_tokens, self.hidden_size)


class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
    """Constructor-compatibility variant of ``AscendAttentionBackendImpl``.

    ``AscendAttentionBackend.get_impl_cls()`` selects this class when
    ``vllm_version_is("0.9.2")``. It differs from its parent only in the
    constructor signature, which additionally accepts
    ``blocksparse_params``.
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: Optional[List[float]],
        sliding_window: Optional[int],
        kv_cache_dtype: str,
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: Optional[str] = None,
        use_irope: bool = False,
    ) -> None:
        """Initialise the parent impl, discarding ``blocksparse_params``.

        Every argument except ``blocksparse_params`` is forwarded to the
        parent constructor by keyword.
        """
        # ``blocksparse_params`` is accepted for signature compatibility
        # only; it is intentionally left out of the forwarded arguments.
        parent_kwargs = dict(
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            num_kv_heads=num_kv_heads,
            alibi_slopes=alibi_slopes,
            sliding_window=sliding_window,
            kv_cache_dtype=kv_cache_dtype,
            logits_soft_cap=logits_soft_cap,
            attn_type=attn_type,
            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
            use_irope=use_irope,
        )
        super().__init__(**parent_kwargs)


def unified_ascend_attention_with_output(
query: torch.Tensor,
key: torch.Tensor,
Expand Down
39 changes: 2 additions & 37 deletions vllm_ascend/attention/attention_v1_torchair.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Type
from typing import List, Optional, Tuple, Type

import numpy as np
import torch
Expand All @@ -29,7 +29,7 @@

from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
nd_to_nz_2d, vllm_version_is)
nd_to_nz_2d)


class AscendAttentionTorchairBackend(AttentionBackend):
Expand All @@ -41,8 +41,6 @@ def get_name() -> str:

@staticmethod
def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
if vllm_version_is("0.9.2"):
return AscendAttentionTorchairBackendImpl092
return AscendAttentionTorchairBackendImpl

@staticmethod
Expand Down Expand Up @@ -489,36 +487,3 @@ def forward(
"to use ascend scheduler.")

return output.view(num_tokens, self.hidden_size)


class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl):
    """Constructor-compatibility variant of the torchair attention impl.

    ``AscendAttentionTorchairBackend.get_impl_cls()`` selects this class
    when ``vllm_version_is("0.9.2")``. It differs from its parent only in
    the constructor signature, which additionally accepts
    ``blocksparse_params``.
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: Optional[List[float]],
        sliding_window: Optional[int],
        kv_cache_dtype: str,
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: Optional[str] = None,
        use_irope: bool = False,
    ) -> None:
        """Initialise the parent impl, discarding ``blocksparse_params``.

        Every argument except ``blocksparse_params`` is forwarded to the
        parent constructor by keyword.
        """
        # ``blocksparse_params`` is accepted for signature compatibility
        # only; it is intentionally left out of the forwarded arguments.
        parent_kwargs = dict(
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            num_kv_heads=num_kv_heads,
            alibi_slopes=alibi_slopes,
            sliding_window=sliding_window,
            kv_cache_dtype=kv_cache_dtype,
            logits_soft_cap=logits_soft_cap,
            attn_type=attn_type,
            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
            use_irope=use_irope,
        )
        super().__init__(**parent_kwargs)
Loading
Loading