ascend-gha-runners · tfhddd · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026
diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
@@ -16,3 +16,9 @@ self-hosted-runner:
     - linux-aarch64-a2b3-1
     - linux-aarch64-a2b3-2
     - linux-aarch64-a2b3-4
+    - linux-amd64-cpu-test-8-hk  # new runner labels added to the actionlint self-hosted-runner list; NOTE(review): confirm they match the labels registered on the runners
+    - linux-amd64-cpu-test-16-hk
+    - linux-aarch64-a2b3-test-0  # "-test-" counterparts of the existing linux-aarch64-a2b3-{1,2,4} labels, plus a -0 variant
+    - linux-aarch64-a2b3-test-1
+    - linux-aarch64-a2b3-test-2
+    - linux-aarch64-a2b3-test-4
diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -66,7 +66,7 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 8 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }}  # config_file_path puts runs for different config files in separate groups, so cancel-in-progress does not cancel them
   cancel-in-progress: true
 
 jobs:
@@ -80,7 +80,6 @@ jobs:
       env:
         KUBECONFIG: /tmp/kubeconfig
         NAMESPACE: vllm-project
-        LEADER_POD: vllm-0
     steps:
         - name: Decode kubeconfig from secrets
           run: |
@@ -101,6 +100,17 @@ jobs:
         - name: Checkout code
           uses: actions/checkout@v6
 
+        - name: Set job variables
+          run: |
+            # Derive a valid k8s resource name from config_file_path and export it (plus the leader pod name) to later steps via GITHUB_ENV.
+            # Strip .yaml, lowercase, map every char outside [a-z0-9-] to '-', cap at 50 chars, then drop trailing hyphens left by the cut.
+            config_file="${{ inputs.config_file_path }}"
+            lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]/-/g' | cut -c1-50 | sed 's/-*$//')
+            LWS_NAME="vllm${lws_suffix:+-${lws_suffix}}"  # falls back to plain "vllm" when the suffix is empty
+            echo "LWS_NAME=${LWS_NAME}" >> "$GITHUB_ENV"
+            echo "LEADER_POD=${LWS_NAME}-0" >> "$GITHUB_ENV"
+            echo "Computed LWS_NAME=${LWS_NAME}"
+
         - name: Prepare scripts
           run: |
             # prepare for lws entrypoint scripts
@@ -110,14 +120,14 @@ jobs:
           run: |
             set -euo pipefail
 
-            CRD_NAME="${CRD_NAME:-vllm}"
             TIMEOUT=${TIMEOUT:-120}
             SLEEP_INTERVAL=2
 
-            echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
-            kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found
+            echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..."
+            kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found  # LWS_NAME is exported by the "Set job variables" step via GITHUB_ENV
+            kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found  # also delete the matching "-leader" Service; absent resources are ignored
 
-            echo "Waiting for all pods starting with 'vllm' to be deleted..."
+            echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..."
             START_TIME=$(date +%s)
 
             while true; do
@@ -126,14 +136,14 @@ jobs:
 
               if [[ $ELAPSED -ge $TIMEOUT ]]; then
                 echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
-                kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
+                kubectl get pods -n "$NAMESPACE" -l "leaderworkerset.sigs.k8s.io/name=${LWS_NAME}" || true  # list leftovers by LWS name label, not by name prefix
                 exit 1
               fi
 
-              PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
+              PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -l "leaderworkerset.sigs.k8s.io/name=${LWS_NAME}" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' || true)  # label selector: pods of other LeaderWorkerSets whose names merely start with this one are not counted
 
               if [[ -z "$PODS_EXIST" ]]; then
-                echo "All vllm pods deleted."
+                echo "All pods for [$LWS_NAME] deleted."
                 break
               else
                 echo "Waiting for pods to be deleted: $PODS_EXIST"
@@ -174,6 +184,7 @@ jobs:
             fi
 
             jinja2 $TEMPLATE_FILE \
+              -D lws_name="$LWS_NAME" \
               -D size="$size" \
               -D replicas="$replicas" \
               -D image="$image" \
@@ -190,7 +201,7 @@ jobs:
 
         - name: Waiting for pod ready
           run: |
-            POD_PREFIX="${POD_PREFIX:-vllm-0}"
+            POD_PREFIX="${LWS_NAME}-0"  # same value as LEADER_POD; also the name prefix of the worker pods (<LWS_NAME>-0-<i>)
             SIZE="${{ inputs.size }}"
             TIMEOUT=1200  # default timeout 20 minutes
 
@@ -260,7 +271,7 @@ jobs:
             trap cleanup EXIT
 
             for i in $(seq 1 $((size - 1))); do
-              POD="vllm-0-${i}"
+              POD="${LWS_NAME}-0-${i}"  # worker pods are named <LWS_NAME>-0-<i>: the leader pod name plus the worker index
 
               echo "==== Collecting logs from worker pod: $POD ===="
               kubectl logs -f "$POD" -n "$NAMESPACE" \
@@ -290,5 +301,34 @@ jobs:
         - name: Post process
           if: always()
           run: |
+            echo "Current pod status:"
             kubectl get pods -n "$NAMESPACE" --ignore-not-found=true
+
+            echo "Deleting resources for [$LWS_NAME]..."
             kubectl delete -f ./lws.yaml --ignore-not-found=true || true
+
+            echo "Waiting for pods of [$LWS_NAME] to fully terminate..."
+            TIMEOUT=300  # seconds; best-effort wait, the loop breaks instead of failing on timeout
+            SLEEP_INTERVAL=5
+            START_TIME=$(date +%s)
+
+            while true; do
+              NOW=$(date +%s)
+              ELAPSED=$((NOW - START_TIME))
+
+              if [[ $ELAPSED -ge $TIMEOUT ]]; then
+                echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway."
+                kubectl get pods -n "$NAMESPACE" -l "leaderworkerset.sigs.k8s.io/name=${LWS_NAME}" || true  # show what is still around, selected by LWS name label
+                break
+              fi
+
+              PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -l "leaderworkerset.sigs.k8s.io/name=${LWS_NAME}" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' || true)  # label selector instead of a name-prefix grep, so pods of other LeaderWorkerSets sharing the prefix are ignored
+
+              if [[ -z "$PODS_EXIST" ]]; then
+                echo "All pods for [$LWS_NAME] have terminated."
+                break
+              else
+                echo "Waiting for pods to terminate: $PODS_EXIST"
+                sleep $SLEEP_INTERVAL
+              fi
+            done
diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml
@@ -71,11 +71,17 @@ jobs:
     env:
       HF_HUB_OFFLINE: 1
       VLLM_USE_MODELSCOPE: True
+      UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple  # package index for uv: a cluster-local PyPI cache service
+      UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi  # same mirror the vllm-ascend install step passes to pip as PIP_EXTRA_INDEX_URL
+      UV_INDEX_STRATEGY: unsafe-best-match  # NOTE(review): presumably chosen so uv resolves across both indexes the way pip does — confirm
+      UV_NO_CACHE: 1
+      UV_SYSTEM_PYTHON: 1  # NOTE(review): presumably makes uv pip install target the image's system Python instead of a virtualenv — confirm
     steps:
       - name: Check npu and CANN info
         run: |
           npu-smi info
           cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+          pip install uv  # bootstrap uv here; the later install steps in this job call "uv pip install"
 
       - name: uninstall vlm vllm-ascend and remove code (if pr test)
         if: ${{ inputs.is_pr_test }}
@@ -110,16 +116,17 @@ jobs:
         if: ${{ inputs.is_pr_test }}
         working-directory: /vllm-workspace/vllm
         run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
+          VLLM_TARGET_DEVICE=empty uv pip install -e .  # editable install of vllm from the working directory, now through uv
 
       - name: Install vllm-project/vllm-ascend
         if: ${{ inputs.is_pr_test }}
         working-directory: /vllm-workspace/vllm-ascend
         env:
           PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
         run: |
-          pip install -r requirements-dev.txt
-          pip install -v -e .
+          pip install uc-manager  # NOTE(review): installed with plain pip rather than uv, so PIP_EXTRA_INDEX_URL above applies to it — confirm this is intentional
+          uv pip install -r requirements-dev.txt
+          uv pip install -v -e .  # editable, verbose install of vllm-ascend from the working directory
 
       - name: Install aisbench
         if: ${{ inputs.is_pr_test }}

diff --git a/.github/workflows/_e2e_nightly_single_node_models.yaml b/.github/workflows/_e2e_nightly_single_node_models.yaml
@@ -67,6 +67,11 @@ jobs:
       env:
         VLLM_USE_MODELSCOPE: True
         GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }}
+        UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple  # package index for uv: a cluster-local PyPI cache service
+        UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi  # same mirror the vllm-ascend install step passes to pip as PIP_EXTRA_INDEX_URL
+        UV_INDEX_STRATEGY: unsafe-best-match  # NOTE(review): presumably chosen so uv resolves across both indexes the way pip does — confirm
+        UV_NO_CACHE: 1
+        UV_SYSTEM_PYTHON: 1  # NOTE(review): presumably makes uv pip install target the image's system Python instead of a virtualenv — confirm
     steps:
       - name: Check npu and CANN info
         run: |
@@ -91,6 +96,7 @@ jobs:
 
           update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20
           update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20
+          pip install uv  # bootstrap uv here; the later install steps in this job call "uv pip install"
 
       - name: Checkout vllm-project/vllm repo
         uses: actions/checkout@v6
@@ -102,14 +108,15 @@ jobs:
       - name: Install vllm-project/vllm from source
         working-directory: ./vllm-empty
         run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
+          VLLM_TARGET_DEVICE=empty uv pip install -e .  # editable install of vllm from ./vllm-empty, now through uv
 
       - name: Install vllm-project/vllm-ascend
         env:
           PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
         run: |
-          pip install -r requirements-dev.txt
-          pip install -v -e .
+          pip install uc-manager  # NOTE(review): installed with plain pip rather than uv, so PIP_EXTRA_INDEX_URL above applies to it — confirm this is intentional
+          uv pip install -r requirements-dev.txt
+          uv pip install -v -e .  # editable, verbose install of vllm-ascend from the working directory
 
       - name: Install tensorflow (for Molmo-7B-D-0924)
         if: ${{ inputs.runner == 'linux-aarch64-a2b3-1' && contains(inputs.model_list, 'Molmo-7B-D-0924') }}