diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index 10dbcaefabc..24fbaab9497 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=5af684c31912232e5c89484c2e8259e0fac6c55b +ARG VLLM_COMMIT=6f786f2c506cb07f4566771fdc62e640e2c4a176 RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 894c8f3e589..8e6b51fd7bf 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -80,7 +80,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0] + vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index afc717a9091..767c3adcc7e 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 5af684c31912232e5c89484c2e8259e0fac6c55b + vllm: 6f786f2c506cb07f4566771fdc62e640e2c4a176 changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -92,7 +92,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0] + vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -104,7 +104,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0] + vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_test_benchmarks.yaml b/.github/workflows/schedule_test_benchmarks.yaml deleted file mode 100644 index f729208fb6b..00000000000 --- a/.github/workflows/schedule_test_benchmarks.yaml +++ /dev/null @@ -1,203 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -name: Performance Schedule Test -# This workflow runs nightly benchmarks for vllm-ascend. - -on: - schedule: - # Run benchmarks at 20:00 and 03:00 Beijing time (UTC+8) - - cron: "0 12 * * *" - - cron: "0 19 * * *" - - workflow_dispatch: - # Allow manual triggering of the workflow - - pull_request: - types: [ labeled ] - -# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly -# declared as "shell: bash -el {0}" on steps that need to be properly activated. -# It's used to activate ascend-toolkit environment variables. -defaults: - run: - shell: bash -el {0} - -# only 1 job can runs on static-8-01-cards -concurrency: - group: static-8-01-cards - cancel-in-progress: false - -jobs: - test: - if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} - - name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }} - runs-on: 'linux-arm64-npu-static-8' - strategy: - matrix: - include: - - vllm_branch: v0.19.0 - vllm_ascend_branch: main - max-parallel: 1 - container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11 - volumes: - - /usr/local/dcmi:/usr/local/dcmi - - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi - - /usr/local/Ascend/driver/:/usr/local/Ascend/driver/ - # Use self-host cache speed up pip and model download - - /home/action/.cache:/github/home/.cache/ - options: >- - --device /dev/davinci0 - --device /dev/davinci1 - --device /dev/davinci_manager - --device /dev/devmm_svm - --device /dev/hisi_hdc - env: - VLLM_USE_MODELSCOPE: True - ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }} - ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }} - steps: - - name: Check npu and CANN info - run: | - npu-smi info - cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info - - - name: Config mirrors - run: | - # keep using tuna's proxy since linux-arm64-npu-static-8 is in another region - sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - - - name: Install system dependencies - run: | - apt-get update -y - apt-get -y install git jq wget curl lsof gcc g++ cmake libnuma-dev - - - name: Config git - run: | - git config --global --add safe.directory "$GITHUB_WORKSPACE" - git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ - - - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v6 - with: - repository: vllm-project/vllm - path: ./vllm-empty - ref: ${{ matrix.vllm_branch }} - - - name: Install vllm-project/vllm from source - working-directory: ./vllm-empty - run: | - VLLM_TARGET_DEVICE=empty pip install -e . - - - name: Install vllm-project/vllm-ascend - env: - PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi - run: | - pip install -e . - pip install -r benchmarks/requirements-bench.txt - - - name: Run current commit benchmarks - if: github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' - run: | - # Sometimes we only want to run benchmarks on the current commit - # This is useful for debugging or a release benchmark - bash benchmarks/scripts/run-performance-benchmarks.sh - # Convert the benchmark results to markdown format - python3 benchmarks/scripts/convert_json_to_markdown.py - - - name: Generate step summary - if: github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' - run: | - cat ./benchmarks/results/benchmark_results.md >> "$GITHUB_STEP_SUMMARY" - - - name: Upload benchmark artifacts - if: github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' - uses: actions/upload-artifact@v7 - with: - name: "benchmark-performance-${{ matrix.vllm_branch }}-${{ matrix.vllm_ascend_branch }}-report" - path: ./benchmarks/results/benchmark_results.md - if-no-files-found: warn - retention-days: 90 - overwrite: true - - - name: Install elastic_tool - if: github.event_name != 'pull_request' - run: | - pip install escli-tool==0.2.3 - - - name: Collect pr info from vllm-project/vllm-ascend - if: github.event_name != 'pull_request' - run: | - # Only get the pull request which may influences performance - git log --pretty=format:"%H %s" -- '**/*.py' ':!docs/*' ':!tests/*' ':!examples/*' ':!benchmarks/*' > commit_log.txt - escli check commit_log.txt - - - name: Prepare benchmark script in advance - if: github.event_name != 'pull_request' - # This is for the benchmark iteration, which will change the benchmark scripts while checkouting each commit. - # We need ensure the benchmark scripts always available. - run: | - # Prepare the benchmark script in advance - mkdir -p /github/home/benchmarks - cp -r benchmarks/* /github/home/benchmarks/ - - - name: Run benchmark iteration - env: - PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi - if: github.event_name != 'pull_request' - run: | - while IFS= read -r line || [[ -n "$line" ]]; do - commit_id=${line%% *} - commit_title=${line#* } - - git checkout "$commit_id" - commit_time=$(git show -s --format=%cd "$commit_id" --date=iso-strict) - commit_time_no_tz="${commit_time::19}" - pip install -e . - - echo "------------------------" - echo "commit_id: $commit_id" - echo "commit_title: $commit_title" - echo "commit_time: $commit_time_no_tz" - echo "vllm branch: ${{ matrix.vllm_branch }}" - echo "vllm-ascend branch: ${{ matrix.vllm_ascend_branch }}" - echo "------------------------" - - cd /github/home - ERROR_MSG="" - if ! bash benchmarks/scripts/run-performance-benchmarks.sh; then - ERROR_MSG="Benchmark failed to run" - fi - # send the result to es - escli add --vllm_branch "${{ matrix.vllm_branch }}" \ - --vllm_ascend_branch "${{ matrix.vllm_ascend_branch }}" \ - --commit_id "$commit_id" \ - --commit_title "$commit_title" \ - --created_at "$commit_time_no_tz" \ - --res_dir ./benchmarks/results \ - --error "$ERROR_MSG" \ - rm -rf ./benchmarks/results - cd - - done < commit_log.txt diff --git a/.github/workflows/schedule_update_estimated_time.yaml b/.github/workflows/schedule_update_estimated_time.yaml index 01175419ffb..01bb5c87aff 100644 --- a/.github/workflows/schedule_update_estimated_time.yaml +++ b/.github/workflows/schedule_update_estimated_time.yaml @@ -23,7 +23,7 @@ jobs: name: e2e-test strategy: matrix: - vllm_version: [v0.18.0] + vllm_version: [v0.19.0] type: [full, light] uses: ./.github/workflows/_e2e_test.yaml with: diff --git a/docs/source/conf.py b/docs/source/conf.py index 0950d4406bc..f01562dc68d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -80,7 +80,7 @@ # CANN image tag "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11", # vLLM commit hash for main branch - "main_vllm_commit": "5af684c31912232e5c89484c2e8259e0fac6c55b", + "main_vllm_commit": "6f786f2c506cb07f4566771fdc62e640e2c4a176", # vLLM tag for main branch "main_vllm_tag": "v0.19.0", # Python version for main branch diff --git a/tests/e2e/singlecard/model_runner_v2/test_basic.py b/tests/e2e/singlecard/model_runner_v2/test_basic.py index cd47bad20e5..59e03d4dd69 100644 --- a/tests/e2e/singlecard/model_runner_v2/test_basic.py +++ b/tests/e2e/singlecard/model_runner_v2/test_basic.py @@ -22,6 +22,7 @@ from vllm import SamplingParams from tests.e2e.conftest import VllmRunner +from vllm_ascend.utils import vllm_version_is MODELS = ["Qwen/Qwen3-0.6B"] @@ -63,6 +64,7 @@ def test_qwen3_dense_eager_mode( runner.model.generate(prompts, sampling_params) +@pytest.mark.skipif(vllm_version_is("0.19.0"), reason="no need to support model_runner for v0.19.0") @pytest.mark.parametrize("model", MAIN_MODELS) @pytest.mark.parametrize("eagle_model", EGALE_MODELS) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/vllm_ascend/patch/worker/patch_gdn_attn.py b/vllm_ascend/patch/worker/patch_gdn_attn.py index 716b3af86fe..53181133dfd 100644 --- a/vllm_ascend/patch/worker/patch_gdn_attn.py +++ b/vllm_ascend/patch/worker/patch_gdn_attn.py @@ -23,6 +23,7 @@ _validate_cu_seqlens, build_chunk_meta_device, ) +from vllm_ascend.utils import is_310p _GDN_CHUNK_SIZE = 64 # Keep this aligned with solve_tril.LARGE_BLOCK_T in ops/triton/fla/solve_tril.py. @@ -596,7 +597,7 @@ def _patched_build( return attn_metadata -if not _IS_PATCHED: +if not _IS_PATCHED and not is_310p(): gdn_attn.GDNChunkedPrefillMetadata = GDNChunkedPrefillMetadata gdn_attn.GDNCausalConv1dHostMetadata = GDNCausalConv1dHostMetadata gdn_attn.GDNPrefillFallbackMeta = GDNPrefillFallbackMeta diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py index 10eb7622885..3ab2986c0ec 100644 --- a/vllm_ascend/quantization/modelslim_config.py +++ b/vllm_ascend/quantization/modelslim_config.py @@ -404,7 +404,7 @@ def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig": return cls(config) @classmethod - def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None: + def override_quantization_method(cls, hf_quant_cfg, user_quant, hf_config: Any = None) -> str | None: if hf_quant_cfg is not None: quant_method = hf_quant_cfg.get("quant_method", None) if not quant_method and torch.npu.is_available(): diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index af57b6bdc97..2992033423a 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2449,7 +2449,6 @@ def _dummy_run( if create_mixed_batch: raise NotImplementedError("create_mixed_batch is used for warmup deepgemm, vllm-ascend does not need it") elif uniform_decode: - assert not create_mixed_batch num_reqs = min(max_num_reqs, cdiv(num_tokens, max_query_len)) num_scheduled_tokens_list = [max_query_len] * num_reqs if num_tokens % max_query_len != 0: @@ -3473,12 +3472,20 @@ def _check_and_update_cudagraph_mode( with update_pass_config(self): super()._check_and_update_cudagraph_mode(attention_backends, kv_cache_groups) + + capture_descs = self.cudagraph_dispatcher.get_capture_descs() + capture_sizes = sorted({ + desc.num_tokens + for _, descs in capture_descs + for desc in descs + }) + # NOTE: Since aclgraph_batch_sizes cannot be determined until here, # we set the graph params right before initializing the keys. if self.use_aclgraph: - set_graph_params(self.cudagraph_batch_sizes) + set_graph_params(capture_sizes) if self.speculative_config: - set_draft_graph_params(self.cudagraph_batch_sizes) + set_draft_graph_params(capture_sizes) def capture_model(self) -> None: gpu_model_runner_cls = next((cls for cls in self.__class__.__mro__ if cls.__name__ == "GPUModelRunner"), None) diff --git a/vllm_ascend/worker/v2/spec_decode/eagle/speculator.py b/vllm_ascend/worker/v2/spec_decode/eagle/speculator.py index 62e3d0e66fc..05fda17b111 100644 --- a/vllm_ascend/worker/v2/spec_decode/eagle/speculator.py +++ b/vllm_ascend/worker/v2/spec_decode/eagle/speculator.py @@ -66,6 +66,7 @@ def propose( dummy_run: bool = False, skip_attn_for_dummy_run: bool = False, mm_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None, + is_profile: Any = None, ): """Override GPU EagleSpeculator.propose for Ascend NPUs, because npu attention metadata needs more information, @@ -92,6 +93,7 @@ def propose( dummy_run, skip_attn_for_dummy_run, mm_inputs, + is_profile=is_profile, ) def generate_draft(