Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
b2598c3
enable mm allreduce test (#2192)
Ronald1995 Aug 7, 2025
dceef08
[main] remove torch.cat and replace it by List[0] (#2153)
loukong33 Aug 7, 2025
ad10837
[CI][Quickfix] Fix AscendFusedMoE init error (#2268)
MengqingCao Aug 8, 2025
0bd5ff5
Fix accuracy test config and add DeepSeek-V2-Lite test (#2261)
wxsIcey Aug 8, 2025
3e65c40
Fix accuracy test create PR (#2274)
wxsIcey Aug 8, 2025
ee6f79c
Add ut for test_communicator.py (#2293)
yangqinghao-cmss Aug 9, 2025
9260910
[CI] Fix broken CI (#2302)
wangxiyuan Aug 11, 2025
1ab1541
[2/N][Refactor] torchair model runner refactor (#2204)
wangxiyuan Aug 11, 2025
c0f0b70
[core] Support capture custom ops into aclgraph (#2113)
ganyi1996ppo Aug 11, 2025
ca27400
Bump actions/download-artifact from 4 to 5 (#2311)
dependabot[bot] Aug 11, 2025
29aaba5
[Perf][MTP] Optimize reject sampler in greedy situation. (#2137)
whx-sjtu Aug 11, 2025
881e36d
[3/N][Refactor] torchair model runner refactor (#2207)
wangxiyuan Aug 11, 2025
eb43a47
[Feat] chunkprefill mla support torchair graph (#1772)
haojiangzheng Aug 11, 2025
c8b0f5f
[4/N][Refactor] torchair model runner refactor (#2208)
wangxiyuan Aug 11, 2025
9c6d108
Configure Gemini (#2298)
QwertyJack Aug 11, 2025
8181790
ut: add ci guard for ut coverage (#2317)
Ronald1995 Aug 12, 2025
dc585f1
[main][prefill optimization] Optimize parallel strategies to reduce c…
kunpengW-code Aug 12, 2025
49ec6c9
[Doc] Update faq (#2334)
MengqingCao Aug 12, 2025
1a70564
[5/N][Refactor] torchair model runner refactor (#2216)
wangxiyuan Aug 12, 2025
992271b
[1/N][Feat] Support MoE models with ACL Graph and refactor MoE commun…
yiz-liu Aug 12, 2025
8bfd16a
[Doc] Add container image save/load FAQ for offline environments (#2347)
QwertyJack Aug 13, 2025
0f7492d
[Bugfix] fix the oom when chunkprefill with long context like 64k (#2…
haojiangzheng Aug 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gemini/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
comment_severity_threshold: HIGH # Reduce quantity of comments
pull_request_opened:
summary: false # Don't summarize the PR in a separate comment
23 changes: 12 additions & 11 deletions .github/workflows/accuracy_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ jobs:
runner: linux-aarch64-a2-1
- model_name: Qwen3-30B-A3B
runner: linux-aarch64-a2-2
- model_name: DeepSeek-V2-Lite
runner: linux-aarch64-a2-2
fail-fast: false

name: ${{ matrix.model_name }} accuracy
Expand Down Expand Up @@ -200,9 +202,8 @@ jobs:
markdown_name="${model_base_name}"
echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
mkdir -p ./benchmarks/accuracy
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
--config ./tests/e2e/singlecard/models/configs/${{ matrix.model_name }}.yaml \
--report_output ./benchmarks/accuracy/${model_base_name}.md
pytest -sv ./tests/e2e/models/test_lm_eval_correctness.py \
--config ./tests/e2e/models/configs/${{ matrix.model_name }}.yaml

- name: Generate step summary
if: ${{ always() }}
Expand All @@ -225,14 +226,14 @@ jobs:

outputs:
model_name: ${{ steps.set_output.outputs.model_name }}

vllm_ascend_version: ${{ env.GHA_VLLM_ASCEND_VERSION }}

create_pr:
runs-on: ubuntu-latest
needs: accuracy_tests
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
env:
UPSTREAM_REPO: vllm-project/vllm-ascend

steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand All @@ -257,10 +258,10 @@ jobs:
TIMESTAMP=$(date +%Y%m%d%H%M%S)
BRANCH_NAME="auto-pr/accuracy-report-${TIMESTAMP}"
echo "BRANCH_NAME=${BRANCH_NAME}" >> $GITHUB_ENV
git checkout -B "${BRANCH_NAME}" upstream/${{ github.event.inputs.vllm-ascend-version }}
git checkout -B "${BRANCH_NAME}" upstream/main

- name: Download only current run reports
uses: actions/download-artifact@v4
uses: actions/download-artifact@v5
with:
path: ./docs/source/developer_guide/evaluation/accuracy_report
pattern: report-*
Expand Down Expand Up @@ -298,7 +299,7 @@ jobs:
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
run: |
git add ./docs/source/developer_guide/evaluation/accuracy_report/*.md
git commit -s -m "[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}"
git commit -s -m "[Doc] Update accuracy reports for ${{ needs.accuracy_tests.outputs.vllm_ascend_version }}"
git push -f origin "${{ env.BRANCH_NAME }}"

- name: Create PR in upstream via API
Expand All @@ -310,9 +311,9 @@ jobs:
owner: 'vllm-project',
repo: 'vllm-ascend',
head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
base: '${{ github.event.inputs.vllm-ascend-version }}',
title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`,
body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for: All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)
base: 'main',
title: `[Doc] Update accuracy reports for ${{ needs.accuracy_tests.outputs.vllm_ascend_version }}`,
body: `The accuracy results running on NPU Atlas A2 have changed, updating reports for: All models (Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base, DeepSeek-V2-Lite)

- [Workflow run][1]

Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/vllm_ascend_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ jobs:
run: |
pip install -r requirements-dev.txt
pip install -v -e .
if [[ "${{ matrix.vllm_version }}" == "v0.10.0" ]]; then
pip install "transformers<4.54.0"
fi

- name: Run e2e test
env:
Expand All @@ -211,8 +214,7 @@ jobs:
--ignore=tests/e2e/singlecard/test_embedding.py \
--ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \
--ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
--ignore=tests/e2e/singlecard/test_offline_inference_310p.py \
--ignore=tests/e2e/singlecard/models/test_lm_eval_correctness.py
--ignore=tests/e2e/singlecard/test_offline_inference_310p.py
e2e-2-cards:
needs: [e2e]
if: ${{ needs.e2e.result == 'success' }}
Expand Down Expand Up @@ -268,6 +270,9 @@ jobs:
run: |
pip install -r requirements-dev.txt
pip install -v -e .
if [[ "${{ matrix.vllm_version }}" == "v0.10.0" ]]; then
pip install "transformers<4.54.0"
fi

- name: Run vllm-project/vllm-ascend test
env:
Expand Down
102 changes: 0 additions & 102 deletions .github/workflows/vllm_ascend_test_long_term.yaml

This file was deleted.

6 changes: 2 additions & 4 deletions codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,10 @@

coverage:
status:
# non-voting, new code must be fully tested
# Patch coverage is mandatory and must be >= 80%
patch:
default:
target: 100%
# non-voting
informational: true
target: 80%
# non-voting
project:
default:
Expand Down
11 changes: 11 additions & 0 deletions csrc/torch_binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@

namespace vllm_ascend {

// Map a torch scalar type onto the corresponding Ascend dtype enum.
// Only Float and BFloat16 are distinguished explicitly; every other
// scalar type falls back to FP16 (the kernels' default precision).
AscendType get_dtype_from_torch(at::ScalarType scalarType)
{
    switch (scalarType) {
        case at::ScalarType::Float:
            return AscendType::FP32;
        case at::ScalarType::BFloat16:
            return AscendType::BF16;
        default:
            return AscendType::FP16;
    }
}

std::tuple<at::Tensor, at::Tensor> rotary_embedding(at::Tensor &positions, at::Tensor &query, at::Tensor &key,
int64_t head_size, at::Tensor &cos_sin_cache, bool is_neox)
{
Expand Down
86 changes: 86 additions & 0 deletions csrc/torch_binding_meta.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#include <torch/extension.h>
#include <torch/library.h>
#include <torch/version.h>
#include <torch_npu/csrc/core/npu/NPUStream.h>
#include <torch_npu/csrc/framework/OpCommand.h>
#include <torch_npu/csrc/npu/Module.h>
#include "utils.h"
/*
* How to write a meta implementation for a custom operator (meta kernel):
*
* Meta implementations are used for shape and dtype inference, tracing, and export.
* They do NOT perform any real computation or allocate device memory.
* Instead, they return empty tensors with the correct shapes, dtypes, and device types.
*
* Steps to write a meta implementation:
* 1. The function signature should match the operator's schema, but only use the arguments
* necessary to infer output shapes and dtypes.
* 2. Use input tensor shapes, dtypes, and any relevant arguments to compute the output shapes.
* 3. Return empty tensors (e.g., at::empty_symint, at::empty_like) with the correct shape and dtype.
* 4. Do NOT perform any real computation or data movement.
* 5. Register the meta implementation with the "Meta" dispatch key using TORCH_LIBRARY_IMPL or similar.
*
* Example:
* std::tuple<at::Tensor, at::Tensor> my_op_meta(
* at::Tensor &input, int64_t some_param) {
* // Infer output shape based on input and parameters
* auto out_shape = ...;
* at::Tensor out = at::empty_symint(out_shape, input.options());
* // Return empty tensor(s) with correct shape/dtype
* return {out, ...};
* }
*
* See below for real examples.
*/

namespace vllm_ascend {
namespace meta {

// Meta (shape-only) implementation of rotary_embedding: infers the output
// shapes from symbolic sizes and returns empty tensors. No data is read,
// no device memory is allocated.
std::tuple<at::Tensor, at::Tensor> rotary_embedding_meta(
    at::Tensor &positions,
    at::Tensor &query,
    at::Tensor &key,
    int64_t head_size,
    at::Tensor &cos_sin_cache,
    bool is_neox) {
    // One position entry per token; head counts follow from the flattened
    // hidden sizes divided by head_size.
    const auto tokens = positions.sym_numel();
    const auto heads = query.sym_numel() / tokens / head_size;
    const auto kv_heads = key.sym_numel() / tokens / head_size;

    at::Tensor out_query = at::empty_symint({tokens, heads, head_size}, query.options());
    at::Tensor out_key = at::empty_symint({tokens, kv_heads, head_size}, key.options());
    return {out_query, out_key};
}

// Meta (shape-only) implementation of get_masked_input_and_mask.
// Both outputs mirror the input's shape: a remapped-id tensor with the
// input's dtype and a boolean mask. The vocab-range arguments influence
// values only, never shapes, so they are intentionally unused here.
std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask_meta(
    at::Tensor &input,
    const int64_t org_vocab_start_index,
    const int64_t org_vocab_end_index,
    const int64_t num_org_vocab_padding,
    const int64_t added_vocab_start_index,
    const int64_t added_vocab_end_index) {
    return {at::empty_like(input),
            at::empty_like(input, input.options().dtype(at::kBool))};
}


} // namespace meta
} // namespace vllm_ascend

namespace {
// Register the meta implementations of the custom kernels for symbolic tracing;
// this also allows the custom kernels to be captured into an aclgraph.
TORCH_LIBRARY_IMPL_EXPAND(_C, Meta, ops) {
// Rotary embedding meta implementation
ops.impl("rotary_embedding", &vllm_ascend::meta::rotary_embedding_meta);
// Masked input and mask meta implementation
ops.impl("get_masked_input_and_mask", &vllm_ascend::meta::get_masked_input_and_mask_meta);

}
}
12 changes: 0 additions & 12 deletions csrc/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,3 @@
}


namespace vllm_ascend {
AscendType get_dtype_from_torch(at::ScalarType scalarType)
{
if (scalarType == at::ScalarType::Float) {
return AscendType::FP32;
} else if (scalarType == at::ScalarType::BFloat16) {
return AscendType::BF16;
} else {
return AscendType::FP16;
}
}
} // namespace vllm_ascend
Loading