5 changes: 2 additions & 3 deletions .github/workflows/accuracy_test.yaml
@@ -34,8 +34,7 @@ on:
# Current supported vLLM versions
options:
- main
-   - v0.9.0.1
-   - v0.9.0
+   - v0.9.1
- v0.7.3
vllm-ascend-version:
description: 'vllm-ascend version:'
@@ -159,7 +158,7 @@ jobs:
repository: vllm-project/vllm
path: ./vllm-empty
# Please also update this when bump matched version
- ref: ${{ github.event.inputs.vllm-version || 'v0.9.0' }}
+ ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }}

- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
53 changes: 0 additions & 53 deletions .github/workflows/actionlint.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/nightly_benchmarks.yaml
@@ -50,7 +50,7 @@ jobs:
strategy:
matrix:
include:
-   - vllm_branch: v0.9.0
+   - vllm_branch: v0.9.1
vllm_ascend_branch: main
container:
image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
13 changes: 12 additions & 1 deletion .github/workflows/vllm_ascend_test.yaml
@@ -33,6 +33,9 @@ on:
- '!benchmarks/**'
- 'tools/mypy.sh'
- 'mypy.ini'
+   - '.github/workflows/*.ya?ml'
+   - '.github/workflows/actionlint.*'
+   - '.github/workflows/matchers/actionlint.json'

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -87,6 +90,13 @@ jobs:
repository: vllm-project/vllm
path: vllm-empty

+ - name: Actionlint Check
+   env:
+     SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
+   run: |
+     echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+     tools/actionlint.sh -color
+
- name: Install vllm-project/vllm from source
working-directory: vllm-empty
run: |
@@ -105,7 +115,7 @@ jobs:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4]
- vllm_version: [main, v0.9.0]
+ vllm_version: [main, v0.9.1]
concurrency:
group: >
${{
@@ -192,6 +202,7 @@ jobs:
fi

- name: Run vllm-project/vllm-ascend test on V0 engine
+ if: ${{ github.event_name == 'schedule' }}
env:
VLLM_USE_V1: 0
run: |
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_long_term.yaml
@@ -43,7 +43,7 @@ jobs:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4]
- vllm_version: [main, v0.9.0]
+ vllm_version: [main, v0.9.1]
name: vLLM Ascend long term test
runs-on: ${{ matrix.os }}
container:
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_pd.yaml
@@ -41,7 +41,7 @@ jobs:
if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
strategy:
matrix:
- vllm_verison: [main, v0.9.0]
+ vllm_verison: [main, v0.9.1]
name: vLLM Ascend prefilling decoding disaggregation test
runs-on: linux-arm64-npu-static-8

2 changes: 1 addition & 1 deletion Dockerfile
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.9.0
+ ARG VLLM_TAG=v0.9.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
2 changes: 1 addition & 1 deletion Dockerfile.openEuler
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.9.0
+ ARG VLLM_TAG=v0.9.1

RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
32 changes: 8 additions & 24 deletions tests/singlecard/compile/test_simple.py
@@ -14,8 +14,6 @@
set_current_vllm_config)
from vllm.utils import direct_register_custom_op

- from vllm_ascend.utils import vllm_version_is
-
global_counter = 0

# create a library to hold the custom op
@@ -93,28 +91,14 @@ def test_simple_piecewise_compile():
model = SillyModel(vllm_config=vllm_config, prefix="")

inputs = torch.randn(100).npu()

if vllm_version_is("0.9.0"):
kwargs = {
"num_graphs_seen": 1, # one graph for the model
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
"num_backend_compilations":
3, # num_piecewise_capturable_graphs_seen
"num_cudagraph_caputured":
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
}
else:
kwargs = {
"num_graphs_seen": 1, # one graph for the model
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
"num_backend_compilations":
3, # num_piecewise_capturable_graphs_seen
"num_cudagraph_captured":
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
}

kwargs = {
"num_graphs_seen": 1, # one graph for the model
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
"num_backend_compilations": 3, # num_piecewise_capturable_graphs_seen
"num_cudagraph_captured":
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
}
with compilation_counter.expect(kwargs):

model(inputs)
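Note: the `compilation_counter.expect(kwargs)` context manager used in this test comes from `vllm.compilation.counter`. As a rough sketch of the snapshot-and-delta check it performs (a simplified toy counter, not the real class, which tracks many more fields):

```python
# Simplified sketch of a delta-checking counter, assuming the same
# snapshot/compare behavior as vllm.compilation.counter; not the real class.
import dataclasses
from contextlib import contextmanager


@dataclasses.dataclass
class ToyCompilationCounter:
    num_graphs_seen: int = 0
    num_cudagraph_captured: int = 0

    @contextmanager
    def expect(self, deltas: dict):
        # Snapshot all counters, run the body, then assert each counter
        # advanced by exactly the expected amount.
        before = dataclasses.asdict(self)
        yield
        after = dataclasses.asdict(self)
        for name, delta in deltas.items():
            assert after[name] - before[name] == delta, name


counter = ToyCompilationCounter()
with counter.expect({"num_graphs_seen": 1}):
    counter.num_graphs_seen += 1  # stands in for a real compilation
```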
61 changes: 18 additions & 43 deletions tests/singlecard/test_scheduler.py
@@ -31,7 +31,6 @@
from vllm.v1.structured_output import StructuredOutputManager

from vllm_ascend.core.scheduler import AscendScheduler
- from vllm_ascend.utils import vllm_version_is

EOS_TOKEN_ID = 50256

@@ -87,27 +86,15 @@ def create_scheduler(
vllm_config = VllmConfig(scheduler_config=scheduler_config,
model_config=model_config,
cache_config=cache_config)

if vllm_version_is("0.9.0"):
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
tensors={},
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32,
False))
],
)
else:
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32,
False, None))
],
)
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32, False,
None))
],
)
cache_config.num_gpu_blocks = 10000
return AscendScheduler(
vllm_config,
@@ -135,27 +122,15 @@ def create_requests(num_requests: int,
else:
mm_position = None
mm_inputs = None
if vllm_version_is("0.9.0"):
request = Request(
request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
multi_modal_inputs=mm_inputs,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
arrival_time=0,
eos_token_id=EOS_TOKEN_ID,
)
else:
request = Request(
request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
multi_modal_inputs=mm_inputs,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
eos_token_id=EOS_TOKEN_ID,
)
request = Request(
request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
multi_modal_inputs=mm_inputs,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
eos_token_id=EOS_TOKEN_ID,
)
requests.append(request)
return requests

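Note: for orientation, a hypothetical test body using the two helpers above might look like the following; any `create_requests` keyword beyond `num_requests` is an assumption, not taken from this diff.

```python
# Hypothetical usage of create_scheduler()/create_requests(); parameter
# names other than num_requests are assumed, not shown in this diff.
scheduler = create_scheduler()
for request in create_requests(num_requests=2):
    scheduler.add_request(request)
output = scheduler.schedule()
# Both fresh requests should be scheduled on an empty scheduler.
assert len(output.scheduled_new_reqs) == 2
```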
8 changes: 1 addition & 7 deletions vllm_ascend/compilation/piecewise_backend.py
@@ -31,8 +31,6 @@
from vllm.logger import logger
from vllm.utils import weak_ref_tensors

- from vllm_ascend.utils import vllm_version_is
-

@dataclasses.dataclass
class ConcreteSizeEntry:
@@ -206,11 +204,7 @@ def __call__(self, *args) -> Any:
# to save memory
entry.output = weak_ref_tensors(output)
entry.aclgraph = aclgraph

if vllm_version_is("0.9.0"):
compilation_counter.num_cudagraph_caputured += 1
else:
compilation_counter.num_cudagraph_captured += 1
compilation_counter.num_cudagraph_captured += 1

# important: we need to return the output, rather than
# the weak ref of the output, so that pytorch can correctly
19 changes: 3 additions & 16 deletions vllm_ascend/core/scheduler.py
@@ -29,8 +29,6 @@
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager

- from vllm_ascend.utils import vllm_version_is
-

class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler
@@ -129,12 +127,7 @@ def skip_cur_request():
continue

assert num_new_tokens > 0

if vllm_version_is("0.9.0"):
blocks = computed_blocks.blocks
else:
blocks = computed_blocks.blocks[0]

blocks = computed_blocks.blocks[0]
watermark = getattr(self.scheduler_config, "watermark", 0.01)
if not self._check_watermark_for_prefill(request, num_new_tokens,
blocks, watermark):
@@ -330,14 +323,8 @@ def _check_watermark_for_prefill(self,
len(computed_blocks) * self.block_size)
num_required_blocks = cdiv(num_new_tokens + num_computed_tokens,
self.block_size)

if vllm_version_is("0.9.0"):
req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[
request.request_id]
else:
req_blocks = self.kv_cache_manager.coordinator.get_blocks(
request.request_id)

req_blocks = self.kv_cache_manager.coordinator.get_blocks(
request.request_id)
num_new_blocks = (num_required_blocks - len(req_blocks) -
len(computed_blocks))
num_evictable_computed_blocks = sum(1 for blk in computed_blocks
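Note: the arithmetic behind `_check_watermark_for_prefill` is plain ceiling-division block accounting. A self-contained illustration with made-up numbers (the block size and token counts are hypothetical; `cdiv` mirrors `vllm.utils.cdiv`):

```python
# Toy illustration of the prefill block accounting above; all numbers
# are hypothetical and only the formula mirrors the scheduler code.
def cdiv(a: int, b: int) -> int:
    # Ceiling division, as in vllm.utils.cdiv.
    return -(a // -b)

block_size = 16
num_computed_tokens = 48   # tokens already covered by cached blocks
num_new_tokens = 40        # tokens this prefill wants to schedule
num_req_blocks = 3         # blocks currently held by the request
num_computed_blocks = 3    # prefix-cache hits

num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, block_size)
num_new_blocks = num_required_blocks - num_req_blocks - num_computed_blocks
print(num_required_blocks, num_new_blocks)  # 6, 0 -> no new allocation needed
```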
15 changes: 2 additions & 13 deletions vllm_ascend/patch/__init__.py
@@ -24,9 +24,9 @@
# each worker's `__init__` function.
#
# Then in each kind of patch, there are three folders:
- # - patch_0_9_0: contains the patches applied when vllm version is 0.9.0.
+ # - patch_0_9_1: contains the patches applied when vllm version is 0.9.1.
# - patch_main: contains the patches applied when vllm version is main branch.
- # - patch_common: contains the patches applied in both 0.9.0 and main branch.
+ # - patch_common: contains the patches applied in both 0.9.1 and main branch.
#
# Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
# ----------------------------------------------------------------------------------
@@ -35,17 +35,6 @@
# --------------------------------
# * Platform Patch:
# =================
- # ** File: platform/patch_0_9_0/patch_distributed.py**
- # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- # 1. `vllm.distributed.utils.stateless_init_torch_distributed_process_group()`
- #    Why:
- #       vllm distributed use gloo backend by default to initialize stateless process group, but we want to use hccl here
- #    How:
- #       Add hccl backend to the `stateless_init_torch_distributed_process_group`
- #    Related PR (if no, explain why):
- #       https://github.com/vllm-project/vllm/pull/18763
- #    Future Plan:
- #       Remove this patch once vllm is upgraded to 0.9.1
# ** File: platform/patch_common/patch_distributed.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.distributed.parallel_state.destroy_model_parallel()`
4 changes: 2 additions & 2 deletions vllm_ascend/patch/platform/__init__.py
@@ -17,8 +17,8 @@
from vllm_ascend.utils import vllm_version_is

# Import specific patches for different versions
if vllm_version_is("0.9.0"):
from vllm_ascend.patch.platform import patch_0_9_0 # noqa: F401
if vllm_version_is("0.9.1"):
from vllm_ascend.patch.platform import patch_0_9_1 # noqa: F401
from vllm_ascend.patch.platform import patch_common # noqa: F401
else:
from vllm_ascend.patch.platform import patch_common # noqa: F401
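Note: the `vllm_version_is` helper gating these imports lives in `vllm_ascend/utils.py`. As a rough sketch of the idea (an approximation for illustration, not the actual implementation):

```python
# Approximate sketch of a version gate; the real helper is
# vllm_ascend.utils.vllm_version_is and may differ in detail.
import vllm


def vllm_version_is(target: str) -> bool:
    # True when the installed vLLM release string matches exactly.
    return vllm.__version__ == target


# Usage mirrors the diff above: pick the per-version patch set at import time.
if vllm_version_is("0.9.1"):
    print("would import patch_0_9_1 + patch_common")
else:
    print("would import patch_common only")
```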