Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
b2598c3
enable mm allreduce test (#2192)
Ronald1995 Aug 7, 2025
dceef08
[main] remove torch.cat and replace it by List[0] (#2153)
loukong33 Aug 7, 2025
ad10837
[CI][Quickfix] Fix AscendFusedMoE init error (#2268)
MengqingCao Aug 8, 2025
0bd5ff5
Fix accuracy test config and add DeepSeek-V2-Lite test (#2261)
wxsIcey Aug 8, 2025
3e65c40
Fix accuracy test create PR (#2274)
wxsIcey Aug 8, 2025
ee6f79c
Add ut for test_communicator.py (#2293)
yangqinghao-cmss Aug 9, 2025
9260910
[CI] Fix broken CI (#2302)
wangxiyuan Aug 11, 2025
1ab1541
[2/N][Refactor] torchair model runner refactor (#2204)
wangxiyuan Aug 11, 2025
c0f0b70
[core] Support capture custom ops into aclgraph (#2113)
ganyi1996ppo Aug 11, 2025
ca27400
Bump actions/download-artifact from 4 to 5 (#2311)
dependabot[bot] Aug 11, 2025
29aaba5
[Perf][MTP] Optimize reject sampler in greedy situation. (#2137)
whx-sjtu Aug 11, 2025
881e36d
[3/N][Refactor] torchair model runner refactor (#2207)
wangxiyuan Aug 11, 2025
eb43a47
[Feat] chunkprefill mla support torchair graph (#1772)
haojiangzheng Aug 11, 2025
c8b0f5f
[4/N][Refactor] torchair model runner refactor (#2208)
wangxiyuan Aug 11, 2025
9c6d108
Configure Gemini (#2298)
QwertyJack Aug 11, 2025
8181790
ut: add ci guard for ut coverage (#2317)
Ronald1995 Aug 12, 2025
dc585f1
[main][prefill optimization] Optimize parallel strategies to reduce c…
kunpengW-code Aug 12, 2025
49ec6c9
[Doc] Update faq (#2334)
MengqingCao Aug 12, 2025
1a70564
[5/N][Refactor] torchair model runner refactor (#2216)
wangxiyuan Aug 12, 2025
992271b
[1/N][Feat] Support MoE models with ACL Graph and refactor MoE commun…
yiz-liu Aug 12, 2025
8bfd16a
[Doc] Add container image save/load FAQ for offline environments (#2347)
QwertyJack Aug 13, 2025
0f7492d
[Bugfix] fix the oom when chunkprefill with long context like 64k (#2…
haojiangzheng Aug 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gemini/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
comment_severity_threshold: HIGH # Reduce quantity of comments
pull_request_opened:
summary: false # Don't summarize the PR in a separate comment
23 changes: 12 additions & 11 deletions .github/workflows/accuracy_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ jobs:
runner: linux-aarch64-a2-1
- model_name: Qwen3-30B-A3B
runner: linux-aarch64-a2-2
- model_name: DeepSeek-V2-Lite
runner: linux-aarch64-a2-2
fail-fast: false

name: ${{ matrix.model_name }} accuracy
Expand Down Expand Up @@ -200,9 +202,8 @@ jobs:
markdown_name="${model_base_name}"
echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
mkdir -p ./benchmarks/accuracy
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
--config ./tests/e2e/singlecard/models/configs/${{ matrix.model_name }}.yaml \
--report_output ./benchmarks/accuracy/${model_base_name}.md
pytest -sv ./tests/e2e/models/test_lm_eval_correctness.py \
--config ./tests/e2e/models/configs/${{ matrix.model_name }}.yaml

- name: Generate step summary
if: ${{ always() }}
Expand All @@ -225,14 +226,14 @@ jobs:

outputs:
model_name: ${{ steps.set_output.outputs.model_name }}

vllm_ascend_version: ${{ env.GHA_VLLM_ASCEND_VERSION }}

create_pr:
runs-on: ubuntu-latest
needs: accuracy_tests
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
env:
UPSTREAM_REPO: vllm-project/vllm-ascend

steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand All @@ -257,10 +258,10 @@ jobs:
TIMESTAMP=$(date +%Y%m%d%H%M%S)
BRANCH_NAME="auto-pr/accuracy-report-${TIMESTAMP}"
echo "BRANCH_NAME=${BRANCH_NAME}" >> $GITHUB_ENV
git checkout -B "${BRANCH_NAME}" upstream/${{ github.event.inputs.vllm-ascend-version }}
git checkout -B "${BRANCH_NAME}" upstream/main

- name: Download only current run reports
uses: actions/download-artifact@v4
uses: actions/download-artifact@v5
with:
path: ./docs/source/developer_guide/evaluation/accuracy_report
pattern: report-*
Expand Down Expand Up @@ -298,7 +299,7 @@ jobs:
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
run: |
git add ./docs/source/developer_guide/evaluation/accuracy_report/*.md
git commit -s -m "[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}"
git commit -s -m "[Doc] Update accuracy reports for ${{ needs.accuracy_tests.outputs.vllm_ascend_version }}"
git push -f origin "${{ env.BRANCH_NAME }}"

- name: Create PR in upstream via API
Expand All @@ -310,9 +311,9 @@ jobs:
owner: 'vllm-project',
repo: 'vllm-ascend',
head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
base: '${{ github.event.inputs.vllm-ascend-version }}',
title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`,
body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for: All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)
base: 'main',
title: `[Doc] Update accuracy reports for ${{ needs.accuracy_tests.outputs.vllm_ascend_version }}`,
body: `The accuracy results running on NPU Atlas A2 have changed, updating reports for: All models (Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base, DeepSeek-V2-Lite)

- [Workflow run][1]

Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/vllm_ascend_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ jobs:
run: |
pip install -r requirements-dev.txt
pip install -v -e .
if [[ "${{ matrix.vllm_version }}" == "v0.10.0" ]]; then
pip install "transformers<4.54.0"
fi

- name: Run e2e test
env:
Expand All @@ -211,8 +214,7 @@ jobs:
--ignore=tests/e2e/singlecard/test_embedding.py \
--ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \
--ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
--ignore=tests/e2e/singlecard/test_offline_inference_310p.py \
--ignore=tests/e2e/singlecard/models/test_lm_eval_correctness.py
--ignore=tests/e2e/singlecard/test_offline_inference_310p.py
e2e-2-cards:
needs: [e2e]
if: ${{ needs.e2e.result == 'success' }}
Expand Down Expand Up @@ -268,6 +270,9 @@ jobs:
run: |
pip install -r requirements-dev.txt
pip install -v -e .
if [[ "${{ matrix.vllm_version }}" == "v0.10.0" ]]; then
pip install "transformers<4.54.0"
fi

- name: Run vllm-project/vllm-ascend test
env:
Expand Down
102 changes: 0 additions & 102 deletions .github/workflows/vllm_ascend_test_long_term.yaml

This file was deleted.

6 changes: 2 additions & 4 deletions codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,10 @@

coverage:
status:
# non-voting, new code must be fully tested
# Patch coverage is mandatory and must be >= 80%
patch:
default:
target: 100%
# non-voting
informational: true
target: 80%
# non-voting
project:
default:
Expand Down
11 changes: 11 additions & 0 deletions csrc/torch_binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@

namespace vllm_ascend {

// Map a torch scalar type onto the corresponding Ascend dtype enum.
// Only Float and BFloat16 are distinguished explicitly; every other
// scalar type falls back to FP16 (the kernels' default precision).
AscendType get_dtype_from_torch(at::ScalarType scalarType)
{
    switch (scalarType) {
        case at::ScalarType::Float:
            return AscendType::FP32;
        case at::ScalarType::BFloat16:
            return AscendType::BF16;
        default:
            return AscendType::FP16;
    }
}

std::tuple<at::Tensor, at::Tensor> rotary_embedding(at::Tensor &positions, at::Tensor &query, at::Tensor &key,
int64_t head_size, at::Tensor &cos_sin_cache, bool is_neox)
{
Expand Down
86 changes: 86 additions & 0 deletions csrc/torch_binding_meta.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#include <torch/extension.h>
#include <torch/library.h>
#include <torch/version.h>
#include <torch_npu/csrc/core/npu/NPUStream.h>
#include <torch_npu/csrc/framework/OpCommand.h>
#include <torch_npu/csrc/npu/Module.h>
#include "utils.h"
/*
* How to write a meta implementation for a custom operator (meta kernel):
*
* Meta implementations are used for shape and dtype inference, tracing, and export.
* They do NOT perform any real computation or allocate device memory.
* Instead, they return empty tensors with the correct shapes, dtypes, and device types.
*
* Steps to write a meta implementation:
* 1. The function signature should match the operator's schema, but only use the arguments
* necessary to infer output shapes and dtypes.
* 2. Use input tensor shapes, dtypes, and any relevant arguments to compute the output shapes.
* 3. Return empty tensors (e.g., at::empty_symint, at::empty_like) with the correct shape and dtype.
* 4. Do NOT perform any real computation or data movement.
* 5. Register the meta implementation with the "Meta" dispatch key using TORCH_LIBRARY_IMPL or similar.
*
* Example:
* std::tuple<at::Tensor, at::Tensor> my_op_meta(
* at::Tensor &input, int64_t some_param) {
* // Infer output shape based on input and parameters
* auto out_shape = ...;
* at::Tensor out = at::empty_symint(out_shape, input.options());
* // Return empty tensor(s) with correct shape/dtype
* return {out, ...};
* }
*
* See below for real examples.
*/

namespace vllm_ascend {
namespace meta {

// Meta (shape-only) implementation of rotary_embedding: infers the output
// shapes from symbolic sizes and returns empty tensors. No data is read,
// no device memory is allocated.
std::tuple<at::Tensor, at::Tensor> rotary_embedding_meta(
    at::Tensor &positions,
    at::Tensor &query,
    at::Tensor &key,
    int64_t head_size,
    at::Tensor &cos_sin_cache,
    bool is_neox) {
    // One position entry per token; head counts follow from the flattened
    // hidden sizes divided by head_size.
    const auto tokens = positions.sym_numel();
    const auto heads = query.sym_numel() / tokens / head_size;
    const auto kv_heads = key.sym_numel() / tokens / head_size;

    at::Tensor out_query = at::empty_symint({tokens, heads, head_size}, query.options());
    at::Tensor out_key = at::empty_symint({tokens, kv_heads, head_size}, key.options());
    return {out_query, out_key};
}

// Meta (shape-only) implementation of get_masked_input_and_mask.
// Both outputs mirror the input's shape: a remapped-id tensor with the
// input's dtype and a boolean mask. The vocab-range arguments influence
// values only, never shapes, so they are intentionally unused here.
std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask_meta(
    at::Tensor &input,
    const int64_t org_vocab_start_index,
    const int64_t org_vocab_end_index,
    const int64_t num_org_vocab_padding,
    const int64_t added_vocab_start_index,
    const int64_t added_vocab_end_index) {
    return {at::empty_like(input),
            at::empty_like(input, input.options().dtype(at::kBool))};
}


} // namespace meta
} // namespace vllm_ascend

namespace {
// Register the meta implementations of the custom kernels for symbolic tracing;
// this also allows the custom kernels to be captured into an aclgraph.
TORCH_LIBRARY_IMPL_EXPAND(_C, Meta, ops) {
// Rotary embedding meta implementation
ops.impl("rotary_embedding", &vllm_ascend::meta::rotary_embedding_meta);
// Masked input and mask meta implementation
ops.impl("get_masked_input_and_mask", &vllm_ascend::meta::get_masked_input_and_mask_meta);

}
}
12 changes: 0 additions & 12 deletions csrc/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,3 @@
}


namespace vllm_ascend {
AscendType get_dtype_from_torch(at::ScalarType scalarType)
{
if (scalarType == at::ScalarType::Float) {
return AscendType::FP32;
} else if (scalarType == at::ScalarType::BFloat16) {
return AscendType::BF16;
} else {
return AscendType::FP16;
}
}
} // namespace vllm_ascend
Loading