diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index f9fa5108081..75c64b58617 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -16,3 +16,9 @@ self-hosted-runner: - linux-aarch64-a2b3-1 - linux-aarch64-a2b3-2 - linux-aarch64-a2b3-4 + - linux-amd64-cpu-test-8-hk + - linux-amd64-cpu-test-16-hk + - linux-aarch64-a2b3-test-0 + - linux-aarch64-a2b3-test-1 + - linux-aarch64-a2b3-test-2 + - linux-aarch64-a2b3-test-4 \ No newline at end of file diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index 1777af17410..cf9f38a73c8 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -66,7 +66,7 @@ defaults: # only cancel in-progress runs of the same workflow # and ignore the lint / 8 cards test type concurrency: - group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }} + group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }} cancel-in-progress: true jobs: @@ -80,7 +80,6 @@ jobs: env: KUBECONFIG: /tmp/kubeconfig NAMESPACE: vllm-project - LEADER_POD: vllm-0 steps: - name: Decode kubeconfig from secrets run: | @@ -101,6 +100,17 @@ jobs: - name: Checkout code uses: actions/checkout@v6 + - name: Set job variables + run: | + # Derive a unique, valid k8s resource name from config_file_path. + # Strip .yaml extension, lowercase, replace dots/underscores with hyphens, cap at 50 chars. 
+ config_file="${{ inputs.config_file_path }}" + lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50) + LWS_NAME="vllm-${lws_suffix}" + echo "LWS_NAME=${LWS_NAME}" >> $GITHUB_ENV + echo "LEADER_POD=${LWS_NAME}-0" >> $GITHUB_ENV + echo "Computed LWS_NAME=${LWS_NAME}" + - name: Prepare scripts run: | # prepare for lws entrypoint scripts @@ -110,14 +120,14 @@ jobs: run: | set -euo pipefail - CRD_NAME="${CRD_NAME:-vllm}" TIMEOUT=${TIMEOUT:-120} SLEEP_INTERVAL=2 - echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..." - kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found + echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..." + kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found + kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found - echo "Waiting for all pods starting with 'vllm' to be deleted..." + echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..." START_TIME=$(date +%s) while true; do @@ -126,14 +136,14 @@ jobs: if [[ $ELAPSED -ge $TIMEOUT ]]; then echo "Timeout reached ($TIMEOUT seconds), some pods still exist:" - kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true + kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true exit 1 fi - PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true) + PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true) if [[ -z "$PODS_EXIST" ]]; then - echo "All vllm pods deleted." + echo "All pods for [$LWS_NAME] deleted." 
break else echo "Waiting for pods to be deleted: $PODS_EXIST" @@ -174,6 +184,7 @@ jobs: fi jinja2 $TEMPLATE_FILE \ + -D lws_name="$LWS_NAME" \ -D size="$size" \ -D replicas="$replicas" \ -D image="$image" \ @@ -190,7 +201,7 @@ jobs: - name: Waiting for pod ready run: | - POD_PREFIX="${POD_PREFIX:-vllm-0}" + POD_PREFIX="${LWS_NAME}-0" SIZE="${{ inputs.size }}" TIMEOUT=1200 # default timeout 20 minutes @@ -260,7 +271,7 @@ jobs: trap cleanup EXIT for i in $(seq 1 $((size - 1))); do - POD="vllm-0-${i}" + POD="${LWS_NAME}-0-${i}" echo "==== Collecting logs from worker pod: $POD ====" kubectl logs -f "$POD" -n "$NAMESPACE" \ @@ -290,5 +301,34 @@ jobs: - name: Post process if: always() run: | + echo "Current pod status:" kubectl get pods -n "$NAMESPACE" --ignore-not-found=true + + echo "Deleting resources for [$LWS_NAME]..." kubectl delete -f ./lws.yaml --ignore-not-found=true || true + + echo "Waiting for pods of [$LWS_NAME] to fully terminate..." + TIMEOUT=300 + SLEEP_INTERVAL=5 + START_TIME=$(date +%s) + + while true; do + NOW=$(date +%s) + ELAPSED=$((NOW - START_TIME)) + + if [[ $ELAPSED -ge $TIMEOUT ]]; then + echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway." + kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true + break + fi + + PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true) + + if [[ -z "$PODS_EXIST" ]]; then + echo "All pods for [$LWS_NAME] have terminated." 
+ break + else + echo "Waiting for pods to terminate: $PODS_EXIST" + sleep $SLEEP_INTERVAL + fi + done diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml index 4e99e8e2c3d..f6ba5cff8ff 100644 --- a/.github/workflows/_e2e_nightly_single_node.yaml +++ b/.github/workflows/_e2e_nightly_single_node.yaml @@ -71,11 +71,17 @@ jobs: env: HF_HUB_OFFLINE: 1 VLLM_USE_MODELSCOPE: True + UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi + UV_INDEX_STRATEGY: unsafe-best-match + UV_NO_CACHE: 1 + UV_SYSTEM_PYTHON: 1 steps: - name: Check npu and CANN info run: | npu-smi info cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + pip install uv - name: uninstall vlm vllm-ascend and remove code (if pr test) if: ${{ inputs.is_pr_test }} @@ -110,7 +116,7 @@ jobs: if: ${{ inputs.is_pr_test }} working-directory: /vllm-workspace/vllm run: | - VLLM_TARGET_DEVICE=empty pip install -e . + VLLM_TARGET_DEVICE=empty uv pip install -e . - name: Install vllm-project/vllm-ascend if: ${{ inputs.is_pr_test }} @@ -118,8 +124,9 @@ jobs: env: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | - pip install -r requirements-dev.txt - pip install -v -e . + pip install uc-manager + uv pip install -r requirements-dev.txt + uv pip install -v -e . 
- name: Install aisbench if: ${{ inputs.is_pr_test }} diff --git a/.github/workflows/_e2e_nightly_single_node_models.yaml b/.github/workflows/_e2e_nightly_single_node_models.yaml index f7672f621f1..44f78c5b3a2 100644 --- a/.github/workflows/_e2e_nightly_single_node_models.yaml +++ b/.github/workflows/_e2e_nightly_single_node_models.yaml @@ -67,6 +67,11 @@ jobs: env: VLLM_USE_MODELSCOPE: True GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }} + UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi + UV_INDEX_STRATEGY: unsafe-best-match + UV_NO_CACHE: 1 + UV_SYSTEM_PYTHON: 1 steps: - name: Check npu and CANN info run: | @@ -91,6 +96,7 @@ jobs: update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 + pip install uv - name: Checkout vllm-project/vllm repo uses: actions/checkout@v6 @@ -102,14 +108,15 @@ jobs: - name: Install vllm-project/vllm from source working-directory: ./vllm-empty run: | - VLLM_TARGET_DEVICE=empty pip install -e . + VLLM_TARGET_DEVICE=empty uv pip install -e . - name: Install vllm-project/vllm-ascend env: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | - pip install -r requirements-dev.txt - pip install -v -e . + pip install uc-manager + uv pip install -r requirements-dev.txt + uv pip install -v -e . 
- name: Install tensorflow (for Molmo-7B-D-0924) if: ${{ inputs.runner == 'linux-aarch64-a2b3-1' && contains(inputs.model_list, 'Molmo-7B-D-0924') }} diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index fb84bd72395..d953cc27f1b 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -19,12 +19,18 @@ on: required: false type: boolean default: false +env: + UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi + UV_INDEX_STRATEGY: unsafe-best-match + UV_NO_CACHE: 1 + UV_SYSTEM_PYTHON: 1 jobs: e2e-light: name: singlecard-light if: ${{ inputs.type == 'light' }} - runs-on: linux-aarch64-a2b3-1 + runs-on: linux-aarch64-a2b3-test-1 strategy: fail-fast: false matrix: @@ -58,6 +64,7 @@ jobs: update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 + pip install uv - name: Checkout vllm-project/vllm repo uses: actions/checkout@v6 @@ -70,14 +77,22 @@ jobs: - name: Install vllm-project/vllm from source working-directory: ./vllm-empty run: | - VLLM_TARGET_DEVICE=empty pip install -e . + VLLM_TARGET_DEVICE=empty uv pip install -e . - name: Install vllm-project/vllm-ascend env: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | - pip install -r requirements-dev.txt - pip install -v -e . + start_time=$(date +%s) + pip install uc-manager + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "uc-manager总耗时:$duration 秒($((duration/60)) 分 $((duration%60)) 秒)" + uv pip install -r requirements-dev.txt + end_time1=$(date +%s) + duration=$((end_time1 - end_time)) + echo "install总耗时:$duration 秒($((duration/60)) 分 $((duration%60)) 秒)" + uv pip install -v -e . 
- name: Run vllm-project/vllm-ascend test env: @@ -147,6 +162,7 @@ jobs: update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 + pip install uv - name: Checkout vllm-project/vllm repo uses: actions/checkout@v6 @@ -159,14 +175,15 @@ jobs: - name: Install vllm-project/vllm from source working-directory: ./vllm-empty run: | - VLLM_TARGET_DEVICE=empty pip install -e . + VLLM_TARGET_DEVICE=empty uv pip install -e . - name: Install vllm-project/vllm-ascend env: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | - pip install -r requirements-dev.txt - pip install -v -e . + pip install uc-manager + uv pip install -r requirements-dev.txt + uv pip install -v -e . - name: Run e2e test env: VLLM_WORKER_MULTIPROC_METHOD: spawn @@ -198,7 +215,7 @@ jobs: e2e-2-cards-light: name: multicard-2-light if: ${{ inputs.type == 'light' }} - runs-on: linux-aarch64-a3-2 + runs-on: linux-aarch64-a2b3-test-2 strategy: fail-fast: false matrix: @@ -233,6 +250,7 @@ jobs: update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 + pip install uv - name: Checkout vllm-project/vllm repo uses: actions/checkout@v6 @@ -245,14 +263,22 @@ jobs: - name: Install vllm-project/vllm from source working-directory: ./vllm-empty run: | - VLLM_TARGET_DEVICE=empty pip install -e . + VLLM_TARGET_DEVICE=empty uv pip install -e . - name: Install vllm-project/vllm-ascend env: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | + start_time=$(date +%s) + pip install uc-manager + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "uc-manager总耗时:$duration 秒($((duration/60)) 分 $((duration%60)) 秒)" pip install -r requirements-dev.txt - pip install -v -e . 
+ end_time1=$(date +%s) + duration=$((end_time1 - end_time)) + echo "install总耗时:$duration 秒($((duration/60)) 分 $((duration%60)) 秒)" + uv pip install -v -e . - name: Run vllm-project/vllm-ascend test (light) env: VLLM_WORKER_MULTIPROC_METHOD: spawn @@ -319,6 +345,7 @@ jobs: update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 + pip install uv - name: Checkout vllm-project/vllm repo uses: actions/checkout@v6 @@ -331,14 +358,15 @@ jobs: - name: Install vllm-project/vllm from source working-directory: ./vllm-empty run: | - VLLM_TARGET_DEVICE=empty pip install -e . + VLLM_TARGET_DEVICE=empty uv pip install -e . - name: Install vllm-project/vllm-ascend env: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | - pip install -r requirements-dev.txt - pip install -v -e . + pip install uc-manager + uv pip install -r requirements-dev.txt + uv pip install -v -e . - name: Run vllm-project/vllm-ascend test (full) env: VLLM_WORKER_MULTIPROC_METHOD: spawn @@ -412,6 +440,7 @@ jobs: update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 + pip install uv - name: Checkout vllm-project/vllm repo uses: actions/checkout@v6 @@ -424,14 +453,15 @@ jobs: - name: Install vllm-project/vllm from source working-directory: ./vllm-empty run: | - VLLM_TARGET_DEVICE=empty pip install -e . + VLLM_TARGET_DEVICE=empty uv pip install -e . - name: Install vllm-project/vllm-ascend env: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | - pip install -r requirements-dev.txt - pip install -v -e . + pip install uc-manager + uv pip install -r requirements-dev.txt + uv pip install -v -e . 
- name: Run vllm-project/vllm-ascend test for V1 Engine env: @@ -491,6 +521,7 @@ jobs: run: | apt-get -y install `cat packages.txt` apt-get -y install gcc g++ cmake libnuma-dev + pip install uv - name: Checkout vllm-project/vllm repo uses: actions/checkout@v6 @@ -503,14 +534,15 @@ jobs: - name: Install vllm-project/vllm from source working-directory: ./vllm-empty run: | - VLLM_TARGET_DEVICE=empty pip install -e . + VLLM_TARGET_DEVICE=empty uv pip install -e . - name: Install vllm-project/vllm-ascend env: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | - pip install -r requirements-dev.txt - pip install -v -e . + pip install uc-manager + uv pip install -r requirements-dev.txt + uv pip install -v -e . - name: Run vllm-project/vllm-ascend test env: @@ -550,6 +582,7 @@ jobs: run: | apt-get -y install `cat packages.txt` apt-get -y install gcc g++ cmake libnuma-dev + pip install uv - name: Checkout vllm-project/vllm repo uses: actions/checkout@v6 @@ -562,14 +595,15 @@ jobs: - name: Install vllm-project/vllm from source working-directory: ./vllm-empty run: | - VLLM_TARGET_DEVICE=empty pip install -e . + VLLM_TARGET_DEVICE=empty uv pip install -e . - name: Install vllm-project/vllm-ascend env: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | - pip install -r requirements-dev.txt - pip install -v -e . + pip install uc-manager + uv pip install -r requirements-dev.txt + uv pip install -v -e . 
- name: Run vllm-project/vllm-ascend test env: diff --git a/.github/workflows/_pre_commit.yml b/.github/workflows/_pre_commit.yml index 0e6b6ddb153..d5f7c4dfdfa 100644 --- a/.github/workflows/_pre_commit.yml +++ b/.github/workflows/_pre_commit.yml @@ -12,7 +12,7 @@ permissions: jobs: pre-commit: - runs-on: linux-amd64-cpu-8-hk + runs-on: linux-amd64-cpu-test-8-hk container: # Build it from https://github.com/nv-action/vllm-benchmarks/blob/main/Dockerfile image: quay.io/ascend-ci/vllm-ascend:lint @@ -50,9 +50,17 @@ jobs: - name: Install vllm-ascend dev (conditional) if: steps.filter.outputs.lint_tracker == 'true' + env: + UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi + UV_INDEX_STRATEGY: unsafe-best-match + UV_NO_CACHE: 1 + UV_SYSTEM_PYTHON: 1 run: | + pip install uv git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend - pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install uc-manager + uv pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu - name: Run pre-commit env: diff --git a/.github/workflows/_unit_test.yaml b/.github/workflows/_unit_test.yaml index b6864bb5c33..9de2ee92952 100644 --- a/.github/workflows/_unit_test.yaml +++ b/.github/workflows/_unit_test.yaml @@ -28,6 +28,12 @@ jobs: SOC_VERSION: ascend910b1 MAX_JOBS: 4 COMPILE_CUSTOM_KERNELS: 0 + UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi + UV_INDEX_STRATEGY: unsafe-best-match + UV_NO_CACHE: 1 + UV_SYSTEM_PYTHON: 1 + UV_PYTHON: python3 steps: - name: Install packages run: | @@ -36,6 +42,7 @@ jobs: pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local apt-get update -y apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 + 
pip install uv - name: Checkout vllm-project/vllm repo uses: actions/checkout@v6 @@ -47,18 +54,25 @@ jobs: - name: Install vllm-project/vllm from source working-directory: ./vllm-empty run: | - VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/ - python3 -m pip uninstall -y triton + VLLM_TARGET_DEVICE=empty uv pip install . --extra-index-url https://download.pytorch.org/whl/cpu/ + uv pip uninstall triton - name: Checkout vllm-project/vllm-ascend repo uses: actions/checkout@v6 - name: Install vllm-project/vllm-ascend run: | - export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi + start_time=$(date +%s) + pip install uc-manager + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "uc-manager总耗时:$duration 秒($((duration/60)) 分 $((duration%60)) 秒)" export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib - python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/ - python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/ + uv pip install -v . 
--extra-index-url https://download.pytorch.org/whl/cpu/ + uv pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu/ + end_time1=$(date +%s) + duration=$((end_time1 - end_time)) + echo "install总耗时:$duration 秒($((duration/60)) 分 $((duration%60)) 秒)" - name: Run unit test env: diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index bb98dfe36a1..366c31dbee3 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -43,7 +43,7 @@ jobs: with: vllm: 4034c3d32e30d01639459edd3ab486f56993876d changes: - runs-on: linux-aarch64-a2b3-0 + runs-on: linux-aarch64-a2b3-test-0 outputs: e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }} ut_tracker: ${{ steps.filter.outputs.ut_tracker }} @@ -94,7 +94,7 @@ jobs: uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} - runner: linux-amd64-cpu-8-hk + runner: linux-amd64-cpu-test-8-hk image: quay.nju.edu.cn/ascend/cann:8.5.1-910b-ubuntu22.04-py3.11 type: pr diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml index 347d4a36d2d..2d9b2b06b88 100644 --- a/.github/workflows/schedule_nightly_test_a2.yaml +++ b/.github/workflows/schedule_nightly_test_a2.yaml @@ -21,8 +21,8 @@ name: Nightly-A2 on: schedule: - # Run test at 24:00 Beijing time (UTC+8) - - cron: "0 16 * * *" + # Run test at 23:45 Beijing time (UTC+8) + - cron: "45 15 * * *" workflow_dispatch: pull_request: branches: @@ -49,7 +49,11 @@ concurrency: jobs: parse-trigger: name: Parse trigger and determine test scope - runs-on: linux-aarch64-a2b3-0 + runs-on: linux-aarch64-a2b3-test-0 + if: >- + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + contains(github.event.pull_request.labels.*.name, 'nightly-test') outputs: should_run: ${{ steps.parse.outputs.should_run }} test_filter: ${{ steps.parse.outputs.test_filter }} @@ -130,7 +134,7 @@ jobs: matrix: 
test_config: - name: test_custom_op - os: linux-aarch64-a2b3-1 + os: linux-aarch64-a2b3-test-1 tests: tests/e2e/nightly/single_node/ops/singlecard_ops - name: test_custom_op_multi_card os: linux-aarch64-a2b3-4 @@ -165,7 +169,7 @@ jobs: matrix: test_config: - name: qwen3-32b - os: linux-aarch64-a2b3-4 + os: linux-aarch64-a2b3-test-4 config_file_path: Qwen3-32B.yaml - name: qwen3-next-80b-a3b-instruct os: linux-aarch64-a2b3-4 @@ -201,7 +205,7 @@ jobs: if: always() && needs.parse-trigger.outputs.should_run == 'true' strategy: fail-fast: false - max-parallel: 1 + max-parallel: 2 matrix: test_config: - name: multi-node-deepseek-dp @@ -244,7 +248,7 @@ jobs: matrix: test_config: - name: accuracy-group-1 - os: linux-aarch64-a2b3-1 + os: linux-aarch64-a2b3-test-1 model_list: - Qwen3-8B - Qwen2-Audio-7B-Instruct diff --git a/.github/workflows/schedule_nightly_test_a3.yaml b/.github/workflows/schedule_nightly_test_a3.yaml index 66528caa904..88c9b5eb719 100644 --- a/.github/workflows/schedule_nightly_test_a3.yaml +++ b/.github/workflows/schedule_nightly_test_a3.yaml @@ -22,8 +22,8 @@ name: Nightly-A3 on: schedule: - # Run test at 24:00 Beijing time (UTC+8) - - cron: "0 16 * * *" + # Run test at 23:45 Beijing time (UTC+8) + - cron: "45 15 * * *" workflow_dispatch: pull_request: branches: @@ -50,6 +50,10 @@ jobs: parse-trigger: name: Parse trigger and determine test scope runs-on: linux-aarch64-a2b3-0 + if: >- + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + contains(github.event.pull_request.labels.*.name, 'nightly-test') outputs: should_run: ${{ steps.parse.outputs.should_run }} test_filter: ${{ steps.parse.outputs.test_filter }} @@ -127,7 +131,7 @@ jobs: if: always() && needs.parse-trigger.outputs.should_run == 'true' strategy: fail-fast: false - max-parallel: 1 + max-parallel: 2 matrix: test_config: - name: multi-node-deepseek-pd diff --git a/docs/source/developer_guide/feature_guide/cpu_binding.md 
b/docs/source/developer_guide/feature_guide/cpu_binding.md new file mode 100644 index 00000000000..28fb46fc0ff --- /dev/null +++ b/docs/source/developer_guide/feature_guide/cpu_binding.md @@ -0,0 +1,228 @@ +# CPU Binding + +## Overview + +CPU binding pins vLLM Ascend worker processes and key threads to specific CPU cores to reduce CPU–NPU cross‑NUMA traffic and stabilize latency under multi‑process workloads. It is designed for ARM servers running Ascend NPUs and is automatically executed during worker initialization when enabled. + +## Background + +On multi‑socket ARM systems, the OS scheduler may place vLLM threads on CPUs far from the local NPU, causing NUMA cross‑traffic and jitter. CPU binding enforces a deterministic CPU placement strategy and optionally binds NPU IRQs to the same CPU pool. This is distinct from other performance features (e.g., graph mode or dynamic batch) because it is purely a host‑side affinity policy and does not change model execution logic. + +## Design & How it works + +### Key concepts + +- **Allowed CPU list**: The cpuset from /proc/self/status (Cpus_allowed_list). All allocations are constrained to this list. +- **Running NPU list**: Logical NPU IDs extracted from npu‑smi process listing, optionally filtered by ASCEND_RT_VISIBLE_DEVICES. +- **CPU pool per NPU**: The CPU list assigned to each logical NPU ID based on the binding mode. +- **Binding modes & Device behavior**: + + | Device type | Default mode | Description | + | ----------- | ------------ | ------------ | + | A3 (No Affinity) | `global_slice` | Splits the allowed CPU list evenly based on the **total number of global logical NPUs**, ensuring each NPU is assigned a contiguous segment of CPU cores. This prevents CPU core overlap across multiple process groups. | + | A2 / 310P / Others | `topo_affinity` | Allocates CPUs based on NPU topology affinity (`npu‑smi info -t topo`). 
If multiple NPUs are assigned to a single NUMA node (which may cause bandwidth contention), the CPU allocation extends to adjacent NUMA nodes. | + + - **Default**: enabled (enable_cpu_binding = true). + - **Fallback**: If NPU topo affinity is unavailable, global_slice is used. + - **Failure handling**: Any exception in binding is logged as a warning and **binding is skipped for that rank**. + +### Execution flow (simplified) + +1. **Feature entry**: worker initialization calls `bind_cpus(local_rank)` when `enable_cpu_binding` is true. +2. **CPU architecture gate**: If the CPU is not ARM, binding is skipped with a log. +3. **Collect device info**: + - Map logical NPU IDs from `npu‑smi info -m`. + - Detect running NPU IDs from npu‑smi info process table. + - Read cpuset from /proc/self/status. + - Read topo affinity from `npu‑smi info -t topo`. +4. **Build CPU pools**: + - Use **global_slice** for A3 devices; **topo_affinity** for A2 and 310P. + - If topo affinity is missing, fall back to global_slice. + - Ensure each NPU has at least 5 CPUs. +5. **Allocate per‑role CPUs**: + - Reserve the first two CPUs for IRQ binding. + - `main`: pool[2:-2] + - `acl`: pool[-2] + - `release`: pool[-1] +6. **Bind threads**: + - Main process is pinned to `main` CPUs. + - ACL threads (named with acl_thread) are pinned to `acl` CPU. + - Release threads (named with release_thread) are pinned to `release` CPU. +7. **Bind NPU IRQs (optional)**: + - If /proc/irq is writable, bind SQ/CQ IRQs to the first two CPUs in the pool. + - irqbalance may be stopped to prevent overrides. +8. **Memory binding (optional)**: + - If migratepages is available, memory for ACL threads is migrated to the NPU’s NUMA node. + +## Allocation plan examples + +The allocation plan is derived directly from the CPU pool per NPU and then split into roles: + +- IRQ CPUs: pool[0], pool[1] +- `main`: pool[2:-2] +- `acl`: pool[-2] +- `release`: pool[-1] + +Below are concrete examples that reflect the actual code paths. 
+ +### Example 1: A3 inference server with 640 CPUs and 16 NPUs + +- allowed_cpus = [0..639] (640 CPUs) +- NUMA nodes = 0..7 (8 NUMA nodes, symmetric layout) +- total_npus = 16 +- running_npu_list = [0..15] +- base = 640 // 16 = 40, extra = 0 +- Each NPU gets a 40‑CPU pool. + +|NPU ID|Assigned CPU Cores (global_slice)|Role Division (IRQ/Main/ACL/Release)| +|---|---|---| +|0|0-39|`IRQ`: 0-1, `Main`: 2-37, `ACL`: 38, `Release`: 39| +|1|40-79|`IRQ`: 40-41, `Main`: 42-77, `ACL`: 78, `Release`: 79| +|...|...|...| +|15|600-639|`IRQ`: 600-601, `Main`: 602-637, `ACL`: 638, `Release`: 639| + +This layout remains deterministic even when multiple processes share the same cpuset, because slicing is based on the global logical NPU ID. + +### Example 2: A3 global_slice, even split + +**Inputs**: + +- allowed_cpus = [0..23] (24 CPUs) +- NUMA nodes = 0..1 (2 NUMA nodes, symmetric layout; NUMA0 = 0..11, NUMA1 = 12..23) +- total_npus = 4 (from npu-smi info -m) +- running_npu_list = [0, 1, 2, 3] + +**Global slice**: + +- base = 24 // 4 = 6, extra = 0 +- Each NPU gets a 6‑CPU pool. 
+ +|NPU ID|Assigned CPU Cores (global_slice)|Role Division (IRQ/Main/ACL/Release)| +|---|---|---| +|0|0-5|`IRQ`: 0-1, `Main`: 2-3, `ACL`: 4, `Release`: 5| +|1|6-11|`IRQ`: 6-7, `Main`: 8-9, `ACL`: 10, `Release`: 11| +|2|12-17|`IRQ`: 12-13, `Main`: 14-15, `ACL`: 16, `Release`: 17| +|3|18-23|`IRQ`: 18-19, `Main`: 20-21, `ACL`: 22, `Release`: 23| + +### Example 3: A3 global_slice, remainder distribution + +**Inputs**: + +- allowed_cpus = [0..16] (17 CPUs) +- NUMA nodes = 0..1 (2 NUMA nodes, symmetric layout; NUMA0 = 0..7, NUMA1 = 8..16) +- total_npus = 3 +- running_npu_list = [0, 1, 2] + +**Global slice**: + +- base = 17 // 3 = 5, extra = 2 +- NPU0 pool size = 6 (base+1) +- NPU1 pool size = 6 (base+1) +- NPU2 pool size = 5 (base) + +|NPU ID|Assigned CPU Cores (global_slice)|Role Division (IRQ/Main/ACL/Release)| +|---|---|---| +|0|0-5|`IRQ`: 0-1, `Main`: 2-3, `ACL`: 4, `Release`: 5| +|1|6-11|`IRQ`: 6-7, `Main`: 8-9, `ACL`: 10, `Release`: 11| +|2|12-16|`IRQ`: 12-13, `Main`: 14, `ACL`: 15, `Release`: 16| + +Note: When a pool size is exactly 5, `main` has a single CPU (pool[2]). If any pool is <5, binding raises an error. + +**NUMA analysis**: + +- With the symmetric NUMA layout above (NUMA0 = 0..7, NUMA1 = 8..16), NPU0 stays within NUMA0, NPU2 stays within NUMA1, but NPU1 spans both NUMA0 (6,7) and NUMA1 (8..11). This is a direct consequence of global slicing over the ordered cpuset; the remainder distribution does not enforce NUMA boundaries. +- If the cpuset numbering is interleaved across NUMA nodes (non‑symmetric layout), cross‑NUMA pools can happen even earlier. This is why symmetric NUMA layout is recommended for best locality. + +### Known limitations and future improvements + +With the current `global_slice` strategy, some CPU/NPU layouts cannot avoid cross‑NUMA pools. A future enhancement should incorporate NUMA node boundaries into the slicing logic so that pools remain within a single NUMA node whenever possible. 
+ +### Example 4: global_slice with visible subset of NPUs + +**Inputs**: + +- total_npus = 8 (from npu-smi info -m) +- running_npu_list = [2, 3] (filtered by ASCEND_RT_VISIBLE_DEVICES) +- allowed_cpus = [0..39] (40 CPUs) +- NUMA nodes = 0..3 (4 NUMA nodes, symmetric layout; 0..9, 10..19, 20..29, 30..39) + +**Global slice**: + +- base = 40 // 8 = 5, extra = 0 +- Only the visible logical NPUs get pools, but slicing uses the global NPU ID so different processes do not overlap. + +|NPU ID|Assigned CPU Cores (global_slice)|Role Division (IRQ/Main/ACL/Release)| +|---|---|---| +|2|10-14|`IRQ`: 10-11, `Main`: 12, `ACL`: 13, `Release`: 14| +|3|15-19|`IRQ`: 15-16, `Main`: 17, `ACL`: 18, `Release`: 19| + +### Example 5: A2/310P topo_affinity with NUMA extension + +**Inputs**: + +- npu_affinity = {0: [0..7], 1: [0..7]} (from `npu-smi info -t topo`) +- allowed_cpus = [0..15] (16 CPUs) +- NUMA nodes = 0..1 (2 NUMA nodes; NUMA0 = 0..7, NUMA1 = 8..15) + +**NUMA extension**: + +- Both NPUs are on NUMA0, so each pool extends to the nearest NUMA node to reduce contention. +- NPU0 extends to NUMA1 -> [0..15] +- NPU1 extends to NUMA1 -> [0..15] + +Because both pools are identical, the allocator applies average distribution across NPUs to avoid overlap. With a pool [0..15] and 2 NPUs, the final pools become: + +|NPU ID|Assigned CPU Cores (topo_affinity)|Role Division (IRQ/Main/ACL/Release)| +|---|---|---| +|0|0-7|`IRQ`: 0-1, `Main`: 2-5, `ACL`: 6, `Release`: 7| +|1|8-15|`IRQ`: 8-9, `Main`: 10-13, `ACL`: 14, `Release`: 15| + +### Example 6: Minimum CPUs per NPU + +**Inputs**: + +- total_npus = 2 +- allowed_cpus = [0..7] (8 CPUs) +- NUMA nodes = 0..1 (2 NUMA nodes, symmetric layout; NUMA0 = 0..3, NUMA1 = 4..7) + +**Result**: + +- base = 4, which is < 5, so binding fails with: "Insufficient CPUs for binding with IRQ/ACL/REL reservations..." 
+ +|NPU ID|Assigned CPU Cores|Role Division (IRQ/Main/ACL/Release)| +|---|---|---| +|0|N/A|Binding error (insufficient CPUs per NPU)| +|1|N/A|Binding error (insufficient CPUs per NPU)| + +To resolve, either reduce total_npus or enlarge the cpuset so that each NPU has at least 5 CPUs. + +### Logging and verification + +- Logs show the selected binding mode and the allocation plan, for example: + - `[cpu_bind_mode] mode=global_slice rank=0 visible_npus=[...]` + - `The CPU allocation plan is as follows: ...` +- You can verify affinity via taskset or `/proc/<pid>/status` after startup. + +## Limitations & Notes + +- **ARM‑only**: Binding is skipped on non‑ARM CPUs. +- **Minimum CPU requirement**: Each logical NPU requires at least 5 CPUs. If the cpuset is smaller, binding fails with an error. +- **NUMA symmetry assumption**: For best locality, the current strategies assume the cpuset is evenly distributed across NUMA nodes and CPU numbering aligns with NUMA layout; otherwise NUMA locality may be suboptimal. + - Example (symmetric layout): 2 NUMA nodes, 64 CPUs total. NUMA0 = CPUs 0–31, NUMA1 = CPUs 32–63, and the cpuset is 0–63. With 4 logical NPUs, global slicing yields 16 CPUs per NPU (0–15, 16–31, 32–47, 48–63), so each NPU’s pool stays within a single NUMA node. +- **Runtime dependencies**: + - Requires npu‑smi and lscpu commands. + - IRQ binding requires write access to /proc/irq. + - Memory binding requires migratepages; otherwise it is skipped. +- **IRQ side effects**: irqbalance may be stopped to avoid overriding bindings. +- **Per‑process behavior**: Only the current rank’s NPU is used for IRQ binding to avoid cross‑process overwrite. + +### Debug logging + +Use the standard vLLM logging configuration to enable debug logs. The binding process emits debug messages (e.g., `[cpu_global_slice] ...`) when debug level is enabled. 
+ +## References + +- CPU binding implementation: vllm_ascend/cpu_binding.py (`DeviceInfo`, `CpuAlloc`, `bind_cpus`) +- Worker integration: vllm_ascend/worker/worker.py (`NPUWorker._init_device`) +- Additional config option: docs/source/user_guide/configuration/additional_config.md (`enable_cpu_binding`) +- Tests: tests/ut/device_allocator/test_cpu_binding.py diff --git a/docs/source/developer_guide/feature_guide/index.md b/docs/source/developer_guide/feature_guide/index.md index 5ed6e65e6f0..dd1cdf2d087 100644 --- a/docs/source/developer_guide/feature_guide/index.md +++ b/docs/source/developer_guide/feature_guide/index.md @@ -6,6 +6,7 @@ This section provides an overview of the features implemented in vLLM Ascend. De :caption: Feature Guide :maxdepth: 1 patch +cpu_binding ModelRunner_prepare_inputs disaggregated_prefill eplb_swift_balancer diff --git a/docs/source/user_guide/configuration/additional_config.md b/docs/source/user_guide/configuration/additional_config.md index f655c49fc68..b961d07151b 100644 --- a/docs/source/user_guide/configuration/additional_config.md +++ b/docs/source/user_guide/configuration/additional_config.md @@ -38,7 +38,7 @@ The following table lists additional configuration options available in vLLM Asc | `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multi-stream shared expert. This option only takes effect on MoE models with shared experts. | | `multistream_overlap_gate` | bool | `False` | Whether to enable multi-stream overlap gate. This option only takes effect on MoE models with shared experts. | | `recompute_scheduler_enable` | bool | `False` | Whether to enable recompute scheduler. | -| `enable_cpu_binding` | bool | `True` | Whether to enable CPU binding. Only takes effect on ARM CPUs; when enabled, A3 uses NUMA-balanced binding strategy and other device types use NUMA-affinity's. | +| `enable_cpu_binding` | bool | `True` | Whether to enable CPU binding. 
Only takes effect on ARM CPUs; A3 uses the global-slicing CPU allocation strategy and other device types use the topo-affinity CPU allocation strategy. | | `SLO_limits_for_dynamic_batch` | int | `-1` | SLO limits for dynamic batch. This is new scheduler to support dynamic batch feature | | `enable_npugraph_ex` | bool | `False` | Whether to enable npugraph_ex graph mode. | | `pa_shape_list` | list | `[]` | The custom shape list of page attention ops. | diff --git a/docs/source/user_guide/feature_guide/cpu_binding.md b/docs/source/user_guide/feature_guide/cpu_binding.md new file mode 100644 index 00000000000..f2df9706050 --- /dev/null +++ b/docs/source/user_guide/feature_guide/cpu_binding.md @@ -0,0 +1,132 @@ +# CPU Binding + +## Overview + +CPU Binding is a performance optimization feature for vLLM, specifically designed for servers equipped with **ARM architecture and Ascend NPUs**. It pins vLLM processes and threads to specific CPU cores to reduce CPU–NPU cross‑NUMA communication overhead and stabilize inference latency. This feature only adjusts host-side CPU affinity policies and **does not alter model execution logic or impact inference results**. 
+ +## Usage + +### Online serving example with CPU binding enabled (by default) + +```bash +vllm serve Qwen/Qwen2.5-7B-Instruct \ + --additional-config '{"enable_cpu_binding": true}' +``` + +### Online serving example with CPU binding disabled + +```bash +vllm serve Qwen/Qwen2.5-7B-Instruct \ + --additional-config '{"enable_cpu_binding": false}' +``` + +### Offline inference example with CPU binding enabled + +```python +from vllm import LLM + +llm = LLM( + model="Qwen/Qwen2.5-7B-Instruct", + additional_config={"enable_cpu_binding": True}, +) +``` + +### Offline inference example with CPU binding disabled + +```python +from vllm import LLM + +llm = LLM( + model="Qwen/Qwen2.5-7B-Instruct", + additional_config={"enable_cpu_binding": False}, +) +``` + +## Dependencies + +### Installation + +#### Ubuntu/Debian + +```bash +sudo apt-get update +sudo apt-get install -y util-linux numactl procps +``` + +#### RHEL/CentOS/Alma/Rocky + +```bash +sudo yum install -y util-linux numactl procps-ng +``` + +#### openEuler + +```bash +sudo dnf install -y util-linux numactl procps-ng +``` + +### IRQ binding's additional considerations + +For best results, if you run inside a Docker container, where `systemctl` is likely unavailable, stop the `irqbalance` service on the host manually before starting vLLM.
Also make sure the container has the necessary permissions to write to `/proc/irq/*/smp_affinity` for IRQ binding: + +- **Stop `irqbalance` service**: + + For example, on an Ubuntu system, you can run the following command to stop irqbalance: + ```bash + sudo systemctl stop irqbalance + ``` + + After the vLLM process has exited, you can restore irqbalance on the host: + + ```bash + sudo systemctl start irqbalance + ``` + +- **Permissions**: + - Read access to `/proc/self/status` and `/proc/interrupts` + - Write access to `/proc/irq/*/smp_affinity` for IRQ binding + +## Common Issues & Troubleshooting + +|Error/Warning Message|Core Cause|Solution| +|---|---|---| +|Can not get running npu info.|The npu-smi process table is empty, or the `ASCEND_RT_VISIBLE_DEVICES` environment variable filters out all NPUs.|1. Ensure the process is running on visible NPUs; 2. Verify that the `ASCEND_RT_VISIBLE_DEVICES` value matches the actual logical NPU IDs.| +|Insufficient CPUs for binding...|The number of CPU cores allocated to each NPU is less than the minimum requirement of 5.|1. Expand the allowed CPU list; 2. Reduce the number of visible NPUs.| +|NPU topo affinity not found...|npu-smi is unable to retrieve NPU topology affinity information.|Verify the integrity of the npu-smi installation and ensure the user has sufficient execution permissions.| +|Bind cpus failed in rankX...|The CPU binding process failed (e.g., taskset is unavailable, or the user lacks write permissions for /proc/irq).|1. Confirm that required tools (taskset, lscpu, npu-smi) are installed and available; 2. Verify the Cpus_allowed_list in `/proc/self/status` is valid.| + +## Key Limitations + +- ARM architecture only: Binding is automatically skipped on x86_64 systems. + +- Symmetric NUMA layout required for optimal performance: CPU numbering should be aligned with NUMA nodes. Non-symmetric layouts may result in cross-NUMA CPU pools, reducing locality. + +- IRQ binding requires write permissions for /proc/irq.
Memory binding depends on the `migratepages` tool; if unavailable, memory migration is skipped. + +## FAQ + +**Q1: Does CPU binding work on x86_64?** + +No. The binding is skipped on non‑ARM CPUs. + +**Q2: Why are only the current rank’s IRQs bound?** + +To avoid multiple processes overwriting IRQ affinity settings for the same device. + +**Q3: What if my cpuset already limits CPUs?** + +The binder uses Cpus_allowed_list from /proc/self/status as the only eligible CPU set. Ensure this list is large enough. + +**Q4: Does CPU binding change model outputs?** + +No. It only affects host‑side affinity and should not change numerical results. + +--- + +## Summary + +1. **Core Objective**: Reduce cross‑NUMA communication by pinning vLLM processes and threads to specific CPU cores, thereby stabilizing inference latency in Ascend NPU deployments (only applicable to ARM architectures). + +2. **Usage**: Enable or disable with `enable_cpu_binding` via `additional_config` in both online and offline workflows. + +3. **Key Limitations**: ARM‑only; relies on symmetric NUMA layouts; binding fails if the CPU pool has fewer than 5 cores; binding errors trigger a warning log but do not terminate the process. diff --git a/docs/source/user_guide/feature_guide/index.md b/docs/source/user_guide/feature_guide/index.md index ad7abdcca3d..32607948850 100644 --- a/docs/source/user_guide/feature_guide/index.md +++ b/docs/source/user_guide/feature_guide/index.md @@ -6,6 +6,7 @@ This section provides a detailed usage guide of vLLM Ascend features. 
:caption: Feature Guide :maxdepth: 1 graph_mode +cpu_binding quantization sleep_mode structured_output diff --git a/examples/save_sharded_state_310.py b/examples/save_sharded_state_310.py index fb7acabe931..0787ae17574 100644 --- a/examples/save_sharded_state_310.py +++ b/examples/save_sharded_state_310.py @@ -24,12 +24,15 @@ Example usage: -python save_sharded_state.py \ +python save_sharded_state_310.py \ --model /path/to/load \ --tensor-parallel-size 8 \ --output /path/to/save \ --enable-compress \ - --compress-process-num 8 + --compress-process-num 8 \ + --enforce-eager \ + --dtype float16 \ + --quantization ascend Then, the model can be loaded with @@ -140,29 +143,30 @@ def get_quant_description(json_file: str) -> dict: return quant_desc -def update_quant_description(json_file: str) -> None: +def update_quant_description(ori_json_file: str, target_json_file: str) -> None: """ Update quantization types in JSON configuration file based on update mapping. Args: - json_file: Path to the JSON configuration file + ori_json_file: Path to the JSON configuration file + target_json_file: Path to the JSON configuration file to be saved Raises: FileNotFoundError: If the JSON file does not exist RuntimeError: If JSON parsing fails or required keys are missing """ - config_path = Path(json_file) + config_path = Path(ori_json_file) try: with config_path.open("r", encoding="utf-8") as file: json_data = json.load(file) except (FileNotFoundError, json.JSONDecodeError) as e: - raise RuntimeError(f"Failed to read configuration file {json_file}: {e}") + raise RuntimeError(f"Failed to read configuration file {ori_json_file}: {e}") original_quant_type = json_data.get("model_quant_type") if not original_quant_type or original_quant_type not in QUANTIZATION_UPDATE_MAP: raise RuntimeError( f"Cannot update quantization type. " - f"Original type '{original_quant_type}' not found or not supported for update in {json_file}." 
+ f"Original type '{original_quant_type}' not found or not supported for update in {ori_json_file}." ) updated_quant_type = QUANTIZATION_UPDATE_MAP[original_quant_type] @@ -175,12 +179,12 @@ def update_quant_description(json_file: str) -> None: updated_config[key] = value try: - new_file_path = config_path.parent / "quant_model_description.json" + new_file_path = Path(target_json_file) with new_file_path.open("w", encoding="utf-8") as file: json.dump(updated_config, file, indent=2, ensure_ascii=False) - os.remove(json_file) + os.remove(ori_json_file) except OSError as e: - raise RuntimeError(f"Failed to write updated configuration to {json_file}: {e}") + raise RuntimeError(f"Failed to write updated configuration to {target_json_file}: {e}") def weight_compress_worker(file_path: str, quant_desc: dict, process_num: int) -> bool: @@ -214,9 +218,6 @@ def weight_compress_worker(file_path: str, quant_desc: dict, process_num: int) - compressor.run() if p.exists(): os.remove(p) - ori_quant_desc_file = p.parent / "quant_model_description.json" - if ori_quant_desc_file.exists(): - os.rename(str(ori_quant_desc_file), str(ori_quant_desc_file.parent / "ori_quant_model_description.json")) compressor.export_safetensors(str(p.parent), safetensors_name=p.name) return True except Exception as e: @@ -248,6 +249,10 @@ def main(args): # 4. 
Compression Logic parameters_map_fpath = output_dir / "parameters_type_map.json" if args.enable_compress: + quant_desc_file = output_dir / "quant_model_description.json" + backup_quant_desc_file = output_dir / "ori_quant_model_description.json" + if quant_desc_file.exists(): + os.rename(str(quant_desc_file), str(backup_quant_desc_file)) quant_desc = get_quant_description(str(parameters_map_fpath)) quant_type = quant_desc["model_quant_type"] if quant_type in SUPPORTED_COMPRESS_QUANT_TYPE: @@ -269,7 +274,7 @@ def main(args): for p in tasks: p.join() - update_quant_description(os.path.join(args.output, "ori_quant_model_description.json")) + update_quant_description(str(backup_quant_desc_file), str(quant_desc_file)) print("Compression completed successfully.") else: print(f"Skipping compression: Unsupported type {quant_type}") diff --git a/requirements-dev.txt b/requirements-dev.txt index e6193c501c5..9ef64609785 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -23,3 +23,4 @@ mindstudio-probe>=8.3.0 arctic-inference==0.1.1 xlite==0.1.0rc3 uc-manager + diff --git a/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 index b6048604421..c1a2f75eef8 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 @@ -1,7 +1,7 @@ apiVersion: leaderworkerset.x-k8s.io/v1 kind: LeaderWorkerSet metadata: - name: vllm + name: {{ lws_name | default("vllm") }} namespace: vllm-project spec: replicas: {{ replicas | default(1) }} @@ -128,7 +128,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: vllm-leader + name: {{ lws_name | default("vllm") }}-leader namespace: vllm-project spec: ports: @@ -137,6 +137,6 @@ spec: protocol: TCP targetPort: 8080 selector: - leaderworkerset.sigs.k8s.io/name: vllm + leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }} role: leader type: ClusterIP diff --git 
a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 index 7e2de7b6205..5b0aa94c4a6 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 @@ -1,7 +1,7 @@ apiVersion: leaderworkerset.x-k8s.io/v1 kind: LeaderWorkerSet metadata: - name: vllm + name: {{ lws_name | default("vllm") }} namespace: vllm-project spec: replicas: {{ replicas | default(1) }} @@ -128,7 +128,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: vllm-leader + name: {{ lws_name | default("vllm") }}-leader namespace: vllm-project spec: ports: @@ -137,6 +137,6 @@ spec: protocol: TCP targetPort: 8080 selector: - leaderworkerset.sigs.k8s.io/name: vllm + leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }} role: leader type: ClusterIP \ No newline at end of file diff --git a/tests/e2e/singlecard/test_quantization.py b/tests/e2e/singlecard/test_quantization.py index 19ee42e05be..b50ac3cfcc4 100644 --- a/tests/e2e/singlecard/test_quantization.py +++ b/tests/e2e/singlecard/test_quantization.py @@ -49,43 +49,6 @@ def test_qwen3_w8a8_quant(): name_1="vllm_quant_w8a8_outputs", ) -# fmt: off -def test_qwen3_w8a8_quant_auto_detect(): - """Test that ModelSlim quantization is auto-detected without --quantization. - - Uses the same W8A8 model as test_qwen3_w8a8_quant but omits the - quantization parameter, verifying that the auto-detection in - maybe_auto_detect_quantization() picks up quant_model_description.json - and produces identical results. - """ - max_tokens = 5 - example_prompts = [ - "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs." - ] - vllm_target_outputs = [([ - 85, 4086, 44, 374, 264, 1550, 42747, 628, 323, 4938, 72816, 44378, 323, - 13480, 4712, 369, 444, 10994, 82, 13, 1084, 374, 6188, 311, 387 - ], 'vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 
It is designed to be' - )] -# fmt: on - - with VllmRunner( - "vllm-ascend/Qwen3-0.6B-W8A8", - max_model_len=8192, - gpu_memory_utilization=0.7, - cudagraph_capture_sizes=[1, 2, 4, 8], - ) as vllm_model: - vllm_quant_auto_detect_outputs = vllm_model.generate_greedy( - example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=vllm_target_outputs, - outputs_1_lst=vllm_quant_auto_detect_outputs, - name_0="vllm_target_outputs", - name_1="vllm_quant_auto_detect_outputs", - ) - - # fmt: off def test_qwen3_dense_w8a16(): max_tokens = 5 diff --git a/tests/ut/_310p/quantization/test_w8a8sc_310.py b/tests/ut/_310p/quantization/test_w8a8sc_310.py new file mode 100644 index 00000000000..0db4b9784c2 --- /dev/null +++ b/tests/ut/_310p/quantization/test_w8a8sc_310.py @@ -0,0 +1,122 @@ +# +# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from tests.ut.base import TestBase +from vllm_ascend._310p.quantization.methods.w8a8sc import AscendW8A8SCLinearMethod310 + + +class TestAscendW8A8SCLinearMethod310(TestBase): + + def setUp(self): + self.method = AscendW8A8SCLinearMethod310() + + def test_get_weight_310(self): + weight = self.method.get_weight(10, 20) + self.assertEqual(weight["weight"].dtype, torch.int8) + self.assertEqual(weight["weight"].shape, (10 * 20, )) + self.assertEqual(weight["index"].dtype, torch.int8) + index_len = math.ceil(10 / 256) * math.ceil(20 / 128) * 8 + self.assertEqual(weight["index"].shape, (index_len, )) + self.assertEqual(weight["info"].dtype, torch.int64) + self.assertEqual(weight["info"].shape, (5, )) + + def test_get_pertensor_param_310(self): + params = self.method.get_pertensor_param(torch.float16) + self.assertEqual(params["input_scale"].dtype, torch.float16) + self.assertEqual(params["input_offset"].dtype, torch.int8) + self.assertEqual(params["input_scale"].shape, (1, )) + self.assertEqual(params["input_offset"].shape, (1, )) + + def test_get_perchannel_param_310(self): + params = self.method.get_perchannel_param(10, torch.float16) + + self.assertEqual(params["quant_bias"].dtype, torch.int32) + self.assertEqual(params["deq_scale"].dtype, torch.int64) + self.assertEqual(params["quant_bias"].shape, (10, )) + self.assertEqual(params["deq_scale"].shape, (10, )) + + @pytest.mark.skip( + "Skip as npu_matmul_compress_dequant will be supported in PTA 26.0.0.") + @patch("torch.ops.vllm.quantize") + @patch("torch_npu.npu_matmul_compress_dequant") + def test_apply_with_x_not_int8_310(self, mock_matmul_compress_dequant, + mock_quantize): + layer = MagicMock() + layer.aclnn_input_scale = torch.randn(256) + layer.aclnn_input_scale_reciprocal = 1.0 / layer.aclnn_input_scale + layer.aclnn_input_offset = torch.randint(-128, + 127, (256, ), + dtype=torch.int8) + layer.weight = 
torch.randint(-128, + 127, (256 * 128, ), + dtype=torch.int8) + layer.index = torch.randint(-128, 127, (8, ), dtype=torch.int8) + layer.deq_scale = torch.randn(128) + layer.quant_bias = torch.randint(-128, 127, (256, )) + layer.params_dtype = torch.float16 + + x = torch.randn(32, 128) + expect_x_output = torch.randint(-128, 127, x.shape, dtype=torch.int8) + mock_quantize.return_value = expect_x_output + + expected_y_output = torch.randn(32, 256) + mock_matmul_compress_dequant.return_value = expected_y_output + + output = self.method.apply(layer, x, tp_rank=0) + + mock_quantize.assert_called_with(x, layer.aclnn_input_scale, + layer.aclnn_input_scale_reciprocal, + layer.aclnn_input_offset) + mock_matmul_compress_dequant.assert_called_with( + expect_x_output, layer.weight, layer.index, layer.quant_bias, + layer.deq_scale) + self.assertTrue(torch.equal(output, expected_y_output)) + + @pytest.mark.skip( + "Skip as npu_matmul_compress_dequant will be supported in PTA 26.0.0.") + @patch("torch.ops.vllm.quantize") + @patch("torch_npu.npu_matmul_compress_dequant") + def test_apply_with_x_is_int8_310(self, mock_matmul_compress_dequant, + mock_quantize): + layer = MagicMock() + layer.aclnn_input_scale = torch.randn(256) + layer.aclnn_input_offset = torch.randint(-128, + 127, (256, ), + dtype=torch.int8) + layer.weight = torch.randint(-128, + 127, (256 * 128, ), + dtype=torch.int8) + layer.index = torch.randint(-128, 127, (8, ), dtype=torch.int8) + layer.deq_scale = torch.randn(128) + layer.quant_bias = torch.randint(-128, 127, (256, )) + layer.params_dtype = torch.float16 + + x = torch.randint(-128, 127, (32, 128), dtype=torch.int8) + + expected_y_output = torch.randn(32, 256) + mock_matmul_compress_dequant.return_value = expected_y_output + + output = self.method.apply(layer, x, tp_rank=0) + + mock_quantize.assert_not_called() + mock_matmul_compress_dequant.assert_called_with( + x, layer.weight, layer.index, layer.quant_bias, layer.deq_scale) + 
self.assertTrue(torch.equal(output, expected_y_output)) diff --git a/tests/ut/quantization/test_modelslim_config.py b/tests/ut/quantization/test_modelslim_config.py index 556c8a4acd3..f71238aa4fc 100644 --- a/tests/ut/quantization/test_modelslim_config.py +++ b/tests/ut/quantization/test_modelslim_config.py @@ -1,6 +1,3 @@ -import json -import os -import tempfile from unittest.mock import MagicMock, patch from vllm.model_executor.layers.fused_moe import FusedMoE @@ -10,7 +7,6 @@ from tests.ut.base import TestBase from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod from vllm_ascend.quantization.modelslim_config import ( - MODELSLIM_CONFIG_FILENAME, AscendModelSlimConfig, ) from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD @@ -57,7 +53,7 @@ def test_get_min_capability(self): def test_get_config_filenames(self): filenames = AscendModelSlimConfig.get_config_filenames() - self.assertEqual(filenames, []) + self.assertEqual(filenames, ["quant_model_description.json"]) def test_from_config(self): config = AscendModelSlimConfig.from_config(self.sample_config) @@ -165,90 +161,5 @@ def test_is_layer_skipped_ascend(self): with self.assertRaises(ValueError): config.is_layer_skipped_ascend("fused_layer", fused_mapping) - def test_init_with_none_config(self): - config = AscendModelSlimConfig(None) - self.assertEqual(config.quant_description, {}) - - def test_init_with_default_config(self): - config = AscendModelSlimConfig() - self.assertEqual(config.quant_description, {}) - - def test_maybe_update_config_already_populated(self): - # When quant_description is already populated, should be a no-op - self.assertTrue(len(self.ascend_config.quant_description) > 0) - self.ascend_config.maybe_update_config("/some/model/path") - # quant_description should remain unchanged - self.assertEqual(self.ascend_config.quant_description, - self.sample_config) - - def test_maybe_update_config_loads_from_file(self): - config = AscendModelSlimConfig() - 
self.assertEqual(config.quant_description, {}) - - quant_data = {"layer1.weight": "INT8", "layer2.weight": "FLOAT"} - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, MODELSLIM_CONFIG_FILENAME) - with open(config_path, "w") as f: - json.dump(quant_data, f) - - config.maybe_update_config(tmpdir) - - self.assertEqual(config.quant_description, quant_data) - - def test_maybe_update_config_raises_when_file_missing(self): - config = AscendModelSlimConfig() - - with tempfile.TemporaryDirectory() as tmpdir: - with self.assertRaises(ValueError) as ctx: - config.maybe_update_config(tmpdir) - - error_msg = str(ctx.exception) - self.assertIn("ModelSlim Quantization Config Not Found", error_msg) - self.assertIn(MODELSLIM_CONFIG_FILENAME, error_msg) - - def test_maybe_update_config_raises_with_json_files_listed(self): - config = AscendModelSlimConfig() - - with tempfile.TemporaryDirectory() as tmpdir: - # Create a dummy json file that is NOT the config file - dummy_path = os.path.join(tmpdir, "config.json") - with open(dummy_path, "w") as f: - json.dump({"dummy": True}, f) - - with self.assertRaises(ValueError) as ctx: - config.maybe_update_config(tmpdir) - - error_msg = str(ctx.exception) - self.assertIn("config.json", error_msg) - - def test_maybe_update_config_non_directory_raises(self): - config = AscendModelSlimConfig() - - with self.assertRaises(ValueError) as ctx: - config.maybe_update_config("not_a_real_directory_path") - - error_msg = str(ctx.exception) - self.assertIn("ModelSlim Quantization Config Not Found", error_msg) - - def test_apply_extra_quant_adaptations_shared_head(self): - config = AscendModelSlimConfig() - config.quant_description = { - "model.layers.0.shared_head.weight": "INT8", - } - config._apply_extra_quant_adaptations() - self.assertIn("model.layers.0.weight", config.quant_description) - self.assertEqual(config.quant_description["model.layers.0.weight"], - "INT8") - - def 
test_apply_extra_quant_adaptations_weight_packed(self): - config = AscendModelSlimConfig() - config.quant_description = { - "model.layers.0.weight_packed": "INT8", - } - config._apply_extra_quant_adaptations() - self.assertIn("model.layers.0.weight", config.quant_description) - self.assertEqual(config.quant_description["model.layers.0.weight"], - "INT8") - def test_get_scaled_act_names(self): self.assertEqual(self.ascend_config.get_scaled_act_names(), []) diff --git a/tests/ut/quantization/test_quant_utils.py b/tests/ut/quantization/test_quant_utils.py deleted file mode 100644 index f148342e504..00000000000 --- a/tests/ut/quantization/test_quant_utils.py +++ /dev/null @@ -1,182 +0,0 @@ -import json -import logging -import os -import tempfile -from unittest.mock import MagicMock, patch - -from tests.ut.base import TestBase -from vllm_ascend.quantization.modelslim_config import MODELSLIM_CONFIG_FILENAME -from vllm_ascend.quantization.utils import ( - detect_quantization_method, - maybe_auto_detect_quantization, -) -from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD - - -class TestDetectQuantizationMethod(TestBase): - - def test_returns_none_for_non_directory(self): - result = detect_quantization_method("/non/existent/path") - self.assertIsNone(result) - - def test_detects_modelslim(self): - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, MODELSLIM_CONFIG_FILENAME) - with open(config_path, "w") as f: - json.dump({"layer.weight": "INT8"}, f) - - result = detect_quantization_method(tmpdir) - self.assertEqual(result, ASCEND_QUANTIZATION_METHOD) - - def test_detects_compressed_tensors(self): - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "config.json") - with open(config_path, "w") as f: - json.dump({ - "quantization_config": { - "quant_method": "compressed-tensors" - } - }, f) - - result = detect_quantization_method(tmpdir) - self.assertEqual(result, 
COMPRESSED_TENSORS_METHOD) - - def test_returns_none_for_no_quant(self): - with tempfile.TemporaryDirectory() as tmpdir: - result = detect_quantization_method(tmpdir) - self.assertIsNone(result) - - def test_returns_none_for_non_compressed_tensors_quant_method(self): - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "config.json") - with open(config_path, "w") as f: - json.dump({ - "quantization_config": { - "quant_method": "gptq" - } - }, f) - - result = detect_quantization_method(tmpdir) - self.assertIsNone(result) - - def test_returns_none_for_config_without_quant_config(self): - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "config.json") - with open(config_path, "w") as f: - json.dump({"model_type": "llama"}, f) - - result = detect_quantization_method(tmpdir) - self.assertIsNone(result) - - def test_returns_none_for_malformed_config_json(self): - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "config.json") - with open(config_path, "w") as f: - f.write("not valid json{{{") - - result = detect_quantization_method(tmpdir) - self.assertIsNone(result) - - def test_modelslim_takes_priority_over_compressed_tensors(self): - """When both ModelSlim config and compressed-tensors config exist, - ModelSlim should take priority.""" - with tempfile.TemporaryDirectory() as tmpdir: - # Create ModelSlim config - modelslim_path = os.path.join(tmpdir, MODELSLIM_CONFIG_FILENAME) - with open(modelslim_path, "w") as f: - json.dump({"layer.weight": "INT8"}, f) - - # Create compressed-tensors config - config_path = os.path.join(tmpdir, "config.json") - with open(config_path, "w") as f: - json.dump({ - "quantization_config": { - "quant_method": "compressed-tensors" - } - }, f) - - result = detect_quantization_method(tmpdir) - self.assertEqual(result, ASCEND_QUANTIZATION_METHOD) - - -class TestMaybeAutoDetectQuantization(TestBase): - - def _make_vllm_config(self, 
model_path="/fake/model", quantization=None): - vllm_config = MagicMock() - vllm_config.model_config.model = model_path - vllm_config.model_config.quantization = quantization - return vllm_config - - @patch("vllm_ascend.quantization.utils.detect_quantization_method", - return_value=None) - def test_no_detection_does_nothing(self, mock_detect): - vllm_config = self._make_vllm_config() - maybe_auto_detect_quantization(vllm_config) - # quantization should remain unchanged - self.assertIsNone(vllm_config.model_config.quantization) - - @patch("vllm_ascend.quantization.utils.detect_quantization_method", - return_value=ASCEND_QUANTIZATION_METHOD) - def test_user_specified_same_method_no_change(self, mock_detect): - vllm_config = self._make_vllm_config( - quantization=ASCEND_QUANTIZATION_METHOD) - maybe_auto_detect_quantization(vllm_config) - self.assertEqual(vllm_config.model_config.quantization, - ASCEND_QUANTIZATION_METHOD) - - @patch("vllm.config.VllmConfig._get_quantization_config", - return_value=MagicMock()) - @patch("vllm_ascend.quantization.utils.detect_quantization_method", - return_value=ASCEND_QUANTIZATION_METHOD) - def test_auto_detect_sets_quantization_and_logs_info( - self, mock_detect, mock_get_quant_config): - """When no --quantization is specified but ModelSlim config is found, - the method should auto-set quantization and emit an INFO log.""" - vllm_config = self._make_vllm_config( - model_path="/fake/quant_model", quantization=None) - - with self.assertLogs("vllm_ascend.quantization.utils", - level=logging.INFO) as cm: - maybe_auto_detect_quantization(vllm_config) - - self.assertEqual(vllm_config.model_config.quantization, - ASCEND_QUANTIZATION_METHOD) - log_output = "\n".join(cm.output) - self.assertIn("Auto-detected quantization method", log_output) - self.assertIn(ASCEND_QUANTIZATION_METHOD, log_output) - self.assertIn("/fake/quant_model", log_output) - - @patch("vllm_ascend.quantization.utils.detect_quantization_method", - 
return_value=ASCEND_QUANTIZATION_METHOD) - def test_user_mismatch_logs_warning(self, mock_detect): - """When user specifies a different method than auto-detected, - a WARNING should be emitted and user's choice should be respected.""" - vllm_config = self._make_vllm_config( - model_path="/fake/quant_model", - quantization=COMPRESSED_TENSORS_METHOD) - - with self.assertLogs("vllm_ascend.quantization.utils", - level=logging.WARNING) as cm: - maybe_auto_detect_quantization(vllm_config) - - # User's choice is respected - self.assertEqual(vllm_config.model_config.quantization, - COMPRESSED_TENSORS_METHOD) - log_output = "\n".join(cm.output) - self.assertIn("Auto-detected quantization method", log_output) - self.assertIn(ASCEND_QUANTIZATION_METHOD, log_output) - self.assertIn(COMPRESSED_TENSORS_METHOD, log_output) - - @patch("vllm_ascend.quantization.utils.detect_quantization_method", - return_value=None) - def test_no_detection_emits_no_log(self, mock_detect): - """When no quantization is detected, no log should be emitted.""" - vllm_config = self._make_vllm_config(quantization=None) - logger_name = "vllm_ascend.quantization.utils" - - with self.assertRaises(AssertionError): - # assertLogs raises AssertionError when no logs are emitted - with self.assertLogs(logger_name, level=logging.DEBUG): - maybe_auto_detect_quantization(vllm_config) - - self.assertIsNone(vllm_config.model_config.quantization) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py index 0f91ca4d9d2..3b75d32e5bb 100644 --- a/tests/ut/test_platform.py +++ b/tests/ut/test_platform.py @@ -125,14 +125,13 @@ def test_inference_mode(self, mock_inference_mode): self.assertIsNone(self.platform.inference_mode()) mock_inference_mode.assert_called_once() - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.utils.update_aclgraph_sizes") @patch("vllm_ascend.utils.get_ascend_device_type", 
return_value=AscendDeviceType.A3) @patch("os.environ", {}) @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") def test_check_and_update_config_basic_config_update( - self, mock_init_recompute, mock_soc_version, mock_update_acl, mock_init_ascend, mock_auto_detect + self, mock_init_recompute, mock_soc_version, mock_update_acl, mock_init_ascend ): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config() vllm_config = TestNPUPlatform.mock_vllm_config() @@ -156,12 +155,11 @@ def test_check_and_update_config_basic_config_update( mock_init_ascend.assert_called_once_with(vllm_config) - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") def test_check_and_update_config_no_model_config_warning( - self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect + self, mock_init_recompute, mock_init_ascend, mock_soc_version ): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config() vllm_config = TestNPUPlatform.mock_vllm_config() @@ -183,11 +181,10 @@ def test_check_and_update_config_no_model_config_warning( self.assertTrue("Model config is missing" in cm.output[0]) - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") - def test_check_and_update_config_enforce_eager_mode(self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect): + def test_check_and_update_config_enforce_eager_mode(self, mock_init_recompute, mock_init_ascend, mock_soc_version): 
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config() vllm_config = TestNPUPlatform.mock_vllm_config() vllm_config.model_config.enforce_eager = True @@ -218,12 +215,11 @@ def test_check_and_update_config_enforce_eager_mode(self, mock_init_recompute, m CUDAGraphMode.NONE, ) - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") def test_check_and_update_config_unsupported_compilation_level( - self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect + self, mock_init_recompute, mock_init_ascend, mock_soc_version ): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config() vllm_config = TestNPUPlatform.mock_vllm_config() @@ -257,10 +253,9 @@ def test_check_and_update_config_unsupported_compilation_level( ) @pytest.mark.skip("Revert me when vllm support setting cudagraph_mode on oot platform") - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) @patch("vllm_ascend.ascend_config.init_ascend_config") - def test_check_and_update_config_unsupported_cudagraph_mode(self, mock_init_ascend, mock_soc_version, mock_auto_detect): + def test_check_and_update_config_unsupported_cudagraph_mode(self, mock_init_ascend, mock_soc_version): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config() vllm_config = TestNPUPlatform.mock_vllm_config() vllm_config.model_config.enforce_eager = False @@ -282,12 +277,11 @@ def test_check_and_update_config_unsupported_cudagraph_mode(self, mock_init_asce CUDAGraphMode.NONE, ) - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") @patch("vllm_ascend.utils.get_ascend_device_type", 
return_value=AscendDeviceType.A3) @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") def test_check_and_update_config_cache_config_block_size( - self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect + self, mock_init_recompute, mock_init_ascend, mock_soc_version ): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config() vllm_config = TestNPUPlatform.mock_vllm_config() @@ -307,12 +301,11 @@ def test_check_and_update_config_cache_config_block_size( self.assertEqual(vllm_config.cache_config.block_size, 128) - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType.A3) @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") def test_check_and_update_config_v1_worker_class_selection( - self, mock_init_recompute, mock_init_ascend, mock_soc_version, mock_auto_detect + self, mock_init_recompute, mock_init_ascend, mock_soc_version ): mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config() vllm_config = TestNPUPlatform.mock_vllm_config() @@ -343,11 +336,10 @@ def test_check_and_update_config_v1_worker_class_selection( "vllm_ascend.xlite.xlite_worker.XliteWorker", ) - @patch("vllm_ascend.quantization.utils.maybe_auto_detect_quantization") @patch("vllm_ascend.ascend_config.init_ascend_config") @patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType._310P) @patch("vllm_ascend.core.recompute_scheduler.RecomputeSchedulerConfig.initialize_from_config") - def test_check_and_update_config_310p_no_custom_ops(self, mock_init_recompute, mock_soc_version, mock_init_ascend, mock_auto_detect): + def test_check_and_update_config_310p_no_custom_ops(self, mock_init_recompute, mock_soc_version, mock_init_ascend): 
mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config() vllm_config = TestNPUPlatform.mock_vllm_config() vllm_config.compilation_config.custom_ops = [] diff --git a/vllm_ascend/_310p/quantization/methods/__init__.py b/vllm_ascend/_310p/quantization/methods/__init__.py index 4399207ad92..9bce13ffb69 100644 --- a/vllm_ascend/_310p/quantization/methods/__init__.py +++ b/vllm_ascend/_310p/quantization/methods/__init__.py @@ -19,4 +19,5 @@ w8a8_dynamic, # noqa: F401 w8a8_static, # noqa: F401 w8a8s, # noqa: F401 + w8a8sc, # noqa: F401 ) diff --git a/vllm_ascend/_310p/quantization/methods/w8a8sc.py b/vllm_ascend/_310p/quantization/methods/w8a8sc.py new file mode 100644 index 00000000000..76de861bba8 --- /dev/null +++ b/vllm_ascend/_310p/quantization/methods/w8a8sc.py @@ -0,0 +1,116 @@ +# +# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import math +from typing import Any + +import torch +import torch_npu +from vllm.distributed import get_tensor_model_parallel_rank + +from vllm_ascend.ops.linear import AscendRowParallelLinear +from vllm_ascend.quantization.methods.base import AscendLinearScheme + +from .registry import register_scheme + + +@register_scheme("W8A8SC", "linear") +class AscendW8A8SCLinearMethod310(AscendLinearScheme): + """310P-only W8A8SC static linear scheme. 
+ + Notes: + - This scheme is discovered via 310P local registry. + """ + + def get_weight( + self, + input_size: int, + output_size: int, + params_dtype: torch.dtype = torch.float16, + ) -> dict[str, Any]: + """ + Get the weight tensors for the W8A8SC quantization scheme. + + Args: + input_size: Size of the input dimension (k) + output_size: Size of the output dimension (n) + params_dtype: Data type for parameters, default is torch.float16 + + Returns: + A dictionary containing: + - "weight": The compressed weight tensor with shape [c], where c is greater than 0 + and not larger than k * n + - "index": Compression index generated simultaneously with compressed weights, + with shape [x], where x = k_index * n_index * 8, k_index = ceil(k1 / tilingK), + n_index = ceil(n1 / tilingN), k1 = k / 32, n1 = n / 16 + - "info": Compression information with length 5, containing compression block + information tilingN, tilingK, original shape of the pre-compression x2 matrix, + and identifier for the compression block traversal direction + """ + self.input_size = input_size + index_len = math.ceil(input_size / 256) * math.ceil(output_size / 128) * 8 + return { + "weight": torch.empty(input_size * output_size, dtype=torch.int8), + "index": torch.empty(index_len, dtype=torch.int8), + "info": torch.empty(5, dtype=torch.int64), + } + + def get_pertensor_param(self, params_dtype: torch.dtype) -> dict[str, Any]: + return { + "input_scale": torch.empty(1, dtype=params_dtype), + "input_offset": torch.empty(1, dtype=torch.int8), + } + + def get_perchannel_param(self, output_size: int, params_dtype: torch.dtype) -> dict[str, Any]: + return { + "quant_bias": torch.empty(output_size, dtype=torch.int32), + "deq_scale": torch.empty(output_size, dtype=torch.int64), + } + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + tp_rank: int | None = 0, + ) -> torch.Tensor: + if x.dtype != torch.int8: + x = torch.ops.vllm.quantize( + x, + 
layer.aclnn_input_scale, + layer.aclnn_input_scale_reciprocal, + layer.aclnn_input_offset, + ) + + return torch_npu.npu_matmul_compress_dequant( + x, + layer.weight, + layer.index, + layer.quant_bias, + layer.deq_scale, + ) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.aclnn_input_scale = layer.input_scale.data.repeat(self.input_size) + layer.aclnn_input_scale_reciprocal = 1.0 / layer.aclnn_input_scale.data + layer.aclnn_input_offset = layer.input_offset.data.repeat(self.input_size).to(layer.aclnn_input_scale.dtype) + layer.deq_scale.data = layer.deq_scale.data.unsqueeze(0).to(torch.uint64) + layer.quant_bias.data = layer.quant_bias.data.unsqueeze(0) + # Only apply bias on row_parallel_linear when tp_rank is 0. + # torch_npu.npu_matmul_compress_dequant's quant_bias cannot be None. + if isinstance(layer, AscendRowParallelLinear) and get_tensor_model_parallel_rank() != 0: + layer.quant_bias.data = torch.zeros_like(layer.quant_bias) diff --git a/vllm_ascend/attention/context_parallel/attention_cp.py b/vllm_ascend/attention/context_parallel/attention_cp.py index eadd4fd7702..655fec1dcdc 100644 --- a/vllm_ascend/attention/context_parallel/attention_cp.py +++ b/vllm_ascend/attention/context_parallel/attention_cp.py @@ -143,7 +143,7 @@ def build( chunked_context_metadata = None attn_mask_seqlens = common_long_seq_metadata.attn_mask_seqlens if num_prefills > 0: - query_lens = query_lens[num_decode_tokens:] + query_lens = query_lens[num_decodes:] context_lens_cpu = num_computed_tokens_cpu[num_decodes:num_reqs] max_context_len_cpu = context_lens_cpu.max().item() if self.chunked_prefill_enabled and max_context_len_cpu > 0: diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 78046e3a93b..8e44cdecdb1 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -178,11 +178,6 @@ def set_device(cls, device: torch.device): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - from 
vllm_ascend.quantization.utils import maybe_auto_detect_quantization - - if vllm_config.model_config is not None: - maybe_auto_detect_quantization(vllm_config) - # initialize ascend config from vllm additional_config cls._fix_incompatible_config(vllm_config) ascend_config = init_ascend_config(vllm_config) diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py index c769bfa00a7..ea397f73fbd 100644 --- a/vllm_ascend/quantization/modelslim_config.py +++ b/vllm_ascend/quantization/modelslim_config.py @@ -21,9 +21,6 @@ configs generated by the ModelSlim tool, along with model-specific mappings. """ -import glob -import json -import os from collections.abc import Mapping from types import MappingProxyType from typing import Any, Optional @@ -42,9 +39,6 @@ from .methods import get_scheme_class -# The config filename that ModelSlim generates after quantizing a model. -MODELSLIM_CONFIG_FILENAME = "quant_model_description.json" - logger = init_logger(__name__) # key: model_type @@ -424,9 +418,9 @@ class AscendModelSlimConfig(QuantizationConfig): quantized using the ModelSlim tool. """ - def __init__(self, quant_config: dict[str, Any] | None = None): + def __init__(self, quant_config: dict[str, Any]): super().__init__() - self.quant_description = quant_config if quant_config is not None else {} + self.quant_description = quant_config # TODO(whx): remove this adaptation after adding "shared_head" # to prefix of DeepSeekShareHead in vLLM. extra_quant_dict = {} @@ -456,12 +450,7 @@ def get_min_capability(cls) -> int: @classmethod def get_config_filenames(cls) -> list[str]: - # Return empty list so that vllm's get_quant_config() skips the - # file-based lookup (which raises an unfriendly "Cannot find the - # config file for ascend" error when the model is not quantized). - # Instead, the config file is loaded in maybe_update_config(), - # which can provide a user-friendly error message. 
- return [] + return ["quant_model_description.json"] @classmethod def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig": @@ -573,98 +562,5 @@ def is_layer_skipped_ascend(self, prefix: str, fused_mapping: Mapping[str, list[ assert is_skipped is not None return is_skipped - def maybe_update_config(self, model_name: str) -> None: - """Load the ModelSlim quantization config from model directory. - - This method is called by vllm after get_quant_config() returns - successfully. Since we return an empty list from get_config_filenames() - to bypass vllm's built-in file lookup, we do the actual config loading - here and provide user-friendly error messages when the config is missing. - - Args: - model_name: Path to the model directory or model name. - """ - # If quant_description is already populated (e.g. from from_config()), - # there is nothing to do. - if self.quant_description: - return - - # Try to find and load the ModelSlim config file - if os.path.isdir(model_name): - config_path = os.path.join(model_name, MODELSLIM_CONFIG_FILENAME) - if os.path.isfile(config_path): - with open(config_path) as f: - self.quant_description = json.load(f) - self._apply_extra_quant_adaptations() - return - - # Check if there are any json files at all to help diagnose - json_files = glob.glob(os.path.join(model_name, "*.json")) - json_names = [os.path.basename(f) for f in json_files] - else: - json_names = [] - - # Config file not found - raise a friendly error message - raise ValueError( - "\n" - + "=" * 80 - + "\n" - + "ERROR: ModelSlim Quantization Config Not Found\n" - + "=" * 80 - + "\n" - + "\n" - + f"You have enabled '--quantization {ASCEND_QUANTIZATION_METHOD}' " - + "(ModelSlim quantization),\n" - + f"but the model at '{model_name}' does not contain the required\n" - + f"quantization config file ('{MODELSLIM_CONFIG_FILENAME}').\n" - + "\n" - + "This usually means the model weights are NOT quantized by " - + "ModelSlim.\n" - + "\n" - + "Please choose one of 
the following solutions:\n" - + "\n" - + " Solution 1: Remove the quantization option " - + "(for float/unquantized models)\n" - + " " - + "-" * 58 - + "\n" - + f" Remove '--quantization {ASCEND_QUANTIZATION_METHOD}' from " - + "your command if you want to\n" - + " run the model with the original (float) weights.\n" - + "\n" - + " Example:\n" - + f" vllm serve {model_name}\n" - + "\n" - + " Solution 2: Quantize your model weights with ModelSlim first\n" - + " " - + "-" * 58 - + "\n" - + " Use the ModelSlim tool to quantize your model weights " - + "before deployment.\n" - + " After quantization, the model directory should contain " - + f"'{MODELSLIM_CONFIG_FILENAME}'.\n" - + " For more information, please refer to:\n" - + " https://gitee.com/ascend/msit/tree/master/msmodelslim\n" - + "\n" - + (f" (Found JSON files in model directory: {json_names})\n" if json_names else "") - + "=" * 80 - ) - - def _apply_extra_quant_adaptations(self) -> None: - """Apply extra adaptations to the quant_description dict. - - This handles known key transformations such as shared_head and - weight_packed mappings. - """ - extra_quant_dict = {} - for k in self.quant_description: - if "shared_head" in k: - new_k = k.replace(".shared_head.", ".") - extra_quant_dict[new_k] = self.quant_description[k] - if "weight_packed" in k: - new_k = k.replace("weight_packed", "weight") - extra_quant_dict[new_k] = self.quant_description[k] - self.quant_description.update(extra_quant_dict) - def get_scaled_act_names(self) -> list[str]: return [] diff --git a/vllm_ascend/quantization/utils.py b/vllm_ascend/quantization/utils.py deleted file mode 100644 index 7c9570b4ac7..00000000000 --- a/vllm_ascend/quantization/utils.py +++ /dev/null @@ -1,147 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -import json -import os - -from vllm.logger import init_logger - -from vllm_ascend.quantization.modelslim_config import MODELSLIM_CONFIG_FILENAME -from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD - -logger = init_logger(__name__) - - -def detect_quantization_method(model_path: str) -> str | None: - """Auto-detect the quantization method from model directory files. - - This function performs a lightweight check (JSON files and file existence - only — no .safetensors or .bin inspection) to determine which quantization - method was used to produce the weights in *model_path*. - - Detection priority: - 1. **ModelSlim (Ascend)** – ``quant_model_description.json`` exists - in the model directory. - 2. **LLM-Compressor (compressed-tensors)** – ``config.json`` contains - a ``quantization_config`` section with - ``"quant_method": "compressed-tensors"``. - 3. **None** – neither condition is met; the caller should fall back to - the default (float) behaviour. - - Args: - model_path: Path to the local model directory. - - Returns: - ``"ascend"`` for ModelSlim models, - ``"compressed-tensors"`` for LLM-Compressor models, - or ``None`` if no quantization signature is found. 
- """ - if not os.path.isdir(model_path): - return None - - # Case 1: ModelSlim — look for quant_model_description.json - modelslim_config_path = os.path.join(model_path, MODELSLIM_CONFIG_FILENAME) - if os.path.isfile(modelslim_config_path): - return ASCEND_QUANTIZATION_METHOD - - # Case 2: LLM-Compressor — look for compressed-tensors in config.json - config_json_path = os.path.join(model_path, "config.json") - if os.path.isfile(config_json_path): - try: - with open(config_json_path) as f: - config = json.load(f) - quant_cfg = config.get("quantization_config") - if isinstance(quant_cfg, dict): - quant_method = quant_cfg.get("quant_method", "") - if quant_method == COMPRESSED_TENSORS_METHOD: - return COMPRESSED_TENSORS_METHOD - except (json.JSONDecodeError, OSError): - # Malformed or unreadable config.json — skip silently. - pass - - # Case 3: No quantization signature found. - return None - - -def maybe_auto_detect_quantization(vllm_config) -> None: - """Auto-detect and apply the quantization method on *vllm_config*. - - This should be called during engine initialisation (from - ``NPUPlatform.check_and_update_config``) **after** ``VllmConfig`` has been - created but **before** heavy weights are loaded. - - Because ``check_and_update_config`` runs *after* - ``VllmConfig.__post_init__`` has already evaluated - ``_get_quantization_config`` (which returned ``None`` when - ``model_config.quantization`` was not set), we must: - - 1. Set ``model_config.quantization`` to the detected value. - 2. Recreate ``vllm_config.quant_config`` so that the quantization - pipeline (``get_quant_config`` → ``QuantizationConfig`` → - ``get_quant_method`` for every layer) is properly initialised. - - Rules: - * If the user explicitly set ``--quantization``, that value is - respected. A warning is emitted when the detected method differs. - * If no ``--quantization`` was given, the detected method (if any) is - applied automatically. 
- - Args: - vllm_config: A ``vllm.config.VllmConfig`` instance (mutable). - """ - model_config = vllm_config.model_config - model_path = model_config.model - user_quant = model_config.quantization - detected = detect_quantization_method(model_path) - - if detected is None: - # No quantization signature found — nothing to do. - return - - if user_quant is not None: - # User explicitly specified a quantization method. - if user_quant != detected: - logger.warning( - "Auto-detected quantization method '%s' from model " - "files at '%s', but user explicitly specified " - "'--quantization %s'. Respecting the user-specified " - "value. If you encounter errors during model loading, " - "consider using '--quantization %s' instead.", - detected, - model_path, - user_quant, - detected, - ) - return - - # No user-specified quantization — apply auto-detected value. - model_config.quantization = detected - logger.info( - "Auto-detected quantization method '%s' from model files " - "at '%s'. To override, pass '--quantization ' explicitly.", - detected, - model_path, - ) - - # Recreate quant_config on VllmConfig. The original __post_init__ - # already ran _get_quantization_config(), but at that point - # model_config.quantization was None so it returned None. Now that - # we've set it, we need to build the actual QuantizationConfig so the - # downstream model-loading code can use it. 
- from vllm.config import VllmConfig as _VllmConfig - - vllm_config.quant_config = _VllmConfig._get_quantization_config(model_config, vllm_config.load_config) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 1bb40291ed2..6ed5caa6c0e 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1016,7 +1016,7 @@ def propose_draft_token_ids( target_positions = self._get_positions(num_scheduled_tokens) target_hidden_states = hidden_states if self.use_aux_hidden_state_outputs: - target_hidden_states = torch.cat([h[:num_scheduled_tokens] for h in aux_hidden_states], dim=-1) + target_hidden_states = torch.cat([h for h in aux_hidden_states], dim=-1) else: token_indices_to_sample = None # input_ids can be None for multimodal models.