6 changes: 2 additions & 4 deletions .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,10 +27,8 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching.
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm

# # Install vLLM common dependencies
RUN python3 -m pip install -r /vllm-workspace/vllm/requirements/common.txt --extra-index https://download.pytorch.org/whl/cpu/ && \
4 changes: 2 additions & 2 deletions .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
@@ -102,7 +102,7 @@ jobs:
strategy:
fail-fast: false
matrix:
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm_version: [v0.20.1]
needs: [parse-trigger]
if: ${{ needs.parse-trigger.outputs.allowed == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: 4d51588e2381018348f1022dfa3a7698899805b7
vllm: c7aa186d67b6f051680831418e957c67f34ba7a2
changes:
runs-on: linux-aarch64-a2b3-0
container:
@@ -154,7 +154,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
strategy:
matrix:
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
uses: ./.github/workflows/_optional_smart_e2e.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -164,7 +164,7 @@
name: e2e-light
strategy:
matrix:
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_update_estimated_time.yaml
@@ -23,7 +23,7 @@ jobs:
name: e2e-test
strategy:
matrix:
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm_version: [v0.20.1]
type: [full, light]
uses: ./.github/workflows/_e2e_test.yaml
with:
2 changes: 1 addition & 1 deletion .github/workflows/schedule_vllm_e2e_test.yaml
@@ -45,7 +45,7 @@
fail-fast: false
matrix:
part: [0, 1, 2, 3]
vllm: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm: [v0.20.1]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11
env:
2 changes: 1 addition & 1 deletion .github/workflows/scripts/config.yaml
@@ -166,4 +166,4 @@ e2e-multicard-4-cards:
- name: tests/e2e/multicard/4-cards/test_pipeline_parallel.py
estimated_time: 679
- name: tests/e2e/multicard/4-cards/test_profiling_chunk_performance.py
estimated_time: 1300
estimated_time: 1300
8 changes: 2 additions & 6 deletions Dockerfile
@@ -48,12 +48,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 2 additions & 6 deletions Dockerfile.310p
@@ -33,12 +33,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 2 additions & 6 deletions Dockerfile.310p.openEuler
@@ -32,12 +32,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 2 additions & 6 deletions Dockerfile.a3
@@ -50,12 +50,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 2 additions & 6 deletions Dockerfile.a3.openEuler
@@ -49,12 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 2 additions & 6 deletions Dockerfile.openEuler
@@ -49,12 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -81,9 +81,9 @@
# CANN image tag
"cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
# vLLM commit hash for main branch
"main_vllm_commit": "4d51588e2381018348f1022dfa3a7698899805b7",
"main_vllm_commit": "c7aa186d67b6f051680831418e957c67f34ba7a2",
# vLLM tag for main branch
"main_vllm_tag": "v0.19.1",
"main_vllm_tag": "v0.20.1",
# Python version for main branch
"main_python_version": ">= 3.10, < 3.12",
# CANN version for main branch
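These entries look like documentation substitution values, so the pinned commit and tag only need to be updated in one place when vLLM moves forward. A hedged sketch of how such a mapping could be wired up in a Sphinx conf.py follows; the myst_substitutions name and usage are assumptions about this repo's docs setup, not something the diff confirms.

# docs/source/conf.py -- sketch only, assuming MyST substitutions are in use
myst_substitutions = {
    # CANN image tag
    "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
    # vLLM commit hash for main branch
    "main_vllm_commit": "c7aa186d67b6f051680831418e957c67f34ba7a2",
    # vLLM tag for main branch
    "main_vllm_tag": "v0.20.1",
}
# Pages could then reference {{ main_vllm_tag }} instead of hard-coding v0.20.1.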
3 changes: 3 additions & 0 deletions mypy.ini
@@ -44,3 +44,6 @@ ignore_missing_imports = True
[mypy-jiwer]
ignore_missing_imports = True

[mypy-vllm.v1.kv_offload.*]
ignore_missing_imports = True

2 changes: 2 additions & 0 deletions tests/e2e/multicard/2-cards/test_qwen3_moe.py
@@ -25,6 +25,7 @@
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer, VllmRunner
from vllm_ascend.utils import vllm_version_is


@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
@@ -74,6 +74,7 @@ def test_qwen3_moe_distributed_aiv_tp2():
vllm_model.generate_greedy(example_prompts, max_tokens)


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@patch.dict(os.environ, {"VLLM_USE_V2_MODEL_RUNNER": "1"})
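The new skip guard in this test relies on the vllm_version_is helper imported from vllm_ascend.utils. Below is a minimal sketch of the pattern, assuming the helper simply compares the installed vLLM version string against the argument (the real helper may check more than that); the test body is a hypothetical placeholder.

import pytest

def vllm_version_is(target: str) -> bool:
    # Assumption: mirrors vllm_ascend.utils.vllm_version_is by comparing the
    # installed vLLM release string against the requested version.
    import vllm
    return vllm.__version__ == target

@pytest.mark.skipif(vllm_version_is("0.20.1"),
                    reason="no need to support model_runner for v0.20.1")
def test_v2_model_runner_only_on_main():
    # Placeholder body; the real tests drive VllmRunner against Ascend hardware.
    assert True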
4 changes: 4 additions & 0 deletions tests/e2e/singlecard/model_runner_v2/test_basic.py
@@ -22,13 +22,15 @@
from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import vllm_version_is

MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]

MAIN_MODELS = ["LLM-Research/Meta-Llama-3.1-8B-Instruct"]
EGALE_MODELS = ["vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"]


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [True])
@@ -63,6 +65,7 @@ def test_qwen3_dense_eager_mode(
runner.model.generate(prompts, sampling_params)


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.parametrize("model", MAIN_MODELS)
@pytest.mark.parametrize("eagle_model", EGALE_MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@@ -101,6 +104,7 @@ def test_egale_spec_decoding(
runner.model.generate(prompts, sampling_params)


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [False])
59 changes: 0 additions & 59 deletions tests/ut/worker/test_model_runner_v2.py

This file was deleted.

20 changes: 14 additions & 6 deletions vllm_ascend/core/scheduler_profiling_chunk.py
@@ -41,6 +41,7 @@
from vllm.v1.utils import record_function_or_nullcontext

from vllm_ascend.core.profiling_chunk_predictor import ProfilingChunkManager
from vllm_ascend.utils import vllm_version_is


class ProfilingChunkScheduler(Scheduler):
@@ -575,12 +576,16 @@ def schedule(self) -> SchedulerOutput: # noqa: C901
if self.is_encoder_decoder and request.has_encoder_inputs and encoder_inputs_to_schedule:
num_encoder_tokens = sum(request.get_num_encoder_embeds(i) for i in encoder_inputs_to_schedule)

if self.scheduler_reserve_full_isl and not self.kv_cache_manager.can_fit_full_sequence(
request,
num_new_computed_tokens=num_new_local_computed_tokens,
new_computed_blocks=new_computed_blocks,
num_external_computed_tokens=num_external_computed_tokens,
num_encoder_tokens=num_encoder_tokens,
if (
vllm_version_is("0.20.1")
and self.scheduler_reserve_full_isl
and not self.kv_cache_manager.can_fit_full_sequence(
request,
num_new_computed_tokens=num_new_local_computed_tokens,
new_computed_blocks=new_computed_blocks,
num_external_computed_tokens=num_external_computed_tokens,
num_encoder_tokens=num_encoder_tokens,
)
):
if request.has_encoder_inputs:
self.encoder_cache_manager.free(request)
@@ -595,6 +600,9 @@ def schedule(self) -> SchedulerOutput: # noqa: C901
num_external_computed_tokens=num_external_computed_tokens,
delay_cache_blocks=load_kv_async,
num_encoder_tokens=num_encoder_tokens,
**(
{} if vllm_version_is("0.20.1") else {"full_sequence_must_fit": self.scheduler_reserve_full_isl}
),
)

if new_blocks is None:
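The **( ... ) unpacking added above keeps a single call site working across both pinned vLLM versions: the full_sequence_must_fit keyword is only forwarded when the installed vLLM is newer than v0.20.1. A minimal sketch of the idea follows; the version check and the allocate_slots callee here are stand-in simplifications, not the real scheduler API.

def vllm_version_is(target: str) -> bool:
    # Stand-in for vllm_ascend.utils.vllm_version_is.
    installed = "0.20.1"
    return installed == target

def allocate_slots(request, num_new_tokens, **kwargs):
    # Hypothetical callee; only newer versions accept full_sequence_must_fit.
    print(f"allocate {num_new_tokens} tokens for {request}, extra={kwargs}")

def schedule_request(request, num_new_tokens, reserve_full_isl):
    # Forward the keyword only when the running vLLM understands it.
    extra = {} if vllm_version_is("0.20.1") else {"full_sequence_must_fit": reserve_full_isl}
    allocate_slots(request, num_new_tokens, **extra)

schedule_request("req-0", 128, reserve_full_isl=True)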
1 change: 1 addition & 0 deletions vllm_ascend/patch/platform/__init__.py
@@ -19,6 +19,7 @@
import vllm_ascend.patch.platform.patch_distributed # noqa
import vllm_ascend.patch.platform.patch_kv_cache_interface # noqa
import vllm_ascend.patch.platform.patch_kv_cache_utils # noqa
import vllm_ascend.patch.platform.patch_mla_prefill_backend # noqa
from vllm_ascend import envs
from vllm_ascend.utils import is_310p
