Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/actionlint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,9 @@ self-hosted-runner:
- linux-aarch64-a2b3-1
- linux-aarch64-a2b3-2
- linux-aarch64-a2b3-4
- linux-amd64-cpu-test-8-hk
- linux-amd64-cpu-test-16-hk
- linux-aarch64-a2b3-test-0
- linux-aarch64-a2b3-test-1
- linux-aarch64-a2b3-test-2
- linux-aarch64-a2b3-test-4
62 changes: 51 additions & 11 deletions .github/workflows/_e2e_nightly_multi_node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ defaults:
# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }}
cancel-in-progress: true

jobs:
Expand All @@ -80,7 +80,6 @@ jobs:
env:
KUBECONFIG: /tmp/kubeconfig
NAMESPACE: vllm-project
LEADER_POD: vllm-0
steps:
- name: Decode kubeconfig from secrets
run: |
Expand All @@ -101,6 +100,17 @@ jobs:
- name: Checkout code
uses: actions/checkout@v6

- name: Set job variables
run: |
# Derive a unique, valid k8s resource name from config_file_path.
# Strip .yaml extension, lowercase, replace dots/underscores with hyphens, cap at 50 chars.
config_file="${{ inputs.config_file_path }}"
lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
LWS_NAME="vllm-${lws_suffix}"
echo "LWS_NAME=${LWS_NAME}" >> $GITHUB_ENV
echo "LEADER_POD=${LWS_NAME}-0" >> $GITHUB_ENV
echo "Computed LWS_NAME=${LWS_NAME}"

- name: Prepare scripts
run: |
# prepare for lws entrypoint scripts
Expand All @@ -110,14 +120,14 @@ jobs:
run: |
set -euo pipefail

CRD_NAME="${CRD_NAME:-vllm}"
TIMEOUT=${TIMEOUT:-120}
SLEEP_INTERVAL=2

echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found
echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..."
kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found
kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found

echo "Waiting for all pods starting with 'vllm' to be deleted..."
echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..."
START_TIME=$(date +%s)

while true; do
Expand All @@ -126,14 +136,14 @@ jobs:

if [[ $ELAPSED -ge $TIMEOUT ]]; then
echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
exit 1
fi

PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)

if [[ -z "$PODS_EXIST" ]]; then
echo "All vllm pods deleted."
echo "All pods for [$LWS_NAME] deleted."
break
else
echo "Waiting for pods to be deleted: $PODS_EXIST"
Expand Down Expand Up @@ -174,6 +184,7 @@ jobs:
fi

jinja2 $TEMPLATE_FILE \
-D lws_name="$LWS_NAME" \
-D size="$size" \
-D replicas="$replicas" \
-D image="$image" \
Expand All @@ -190,7 +201,7 @@ jobs:

- name: Waiting for pod ready
run: |
POD_PREFIX="${POD_PREFIX:-vllm-0}"
POD_PREFIX="${LWS_NAME}-0"
SIZE="${{ inputs.size }}"
TIMEOUT=1200 # default timeout 20 minutes

Expand Down Expand Up @@ -260,7 +271,7 @@ jobs:
trap cleanup EXIT

for i in $(seq 1 $((size - 1))); do
POD="vllm-0-${i}"
POD="${LWS_NAME}-0-${i}"

echo "==== Collecting logs from worker pod: $POD ===="
kubectl logs -f "$POD" -n "$NAMESPACE" \
Expand Down Expand Up @@ -290,5 +301,34 @@ jobs:
- name: Post process
if: always()
run: |
echo "Current pod status:"
kubectl get pods -n "$NAMESPACE" --ignore-not-found=true

echo "Deleting resources for [$LWS_NAME]..."
kubectl delete -f ./lws.yaml --ignore-not-found=true || true

echo "Waiting for pods of [$LWS_NAME] to fully terminate..."
TIMEOUT=300
SLEEP_INTERVAL=5
START_TIME=$(date +%s)

while true; do
NOW=$(date +%s)
ELAPSED=$((NOW - START_TIME))

if [[ $ELAPSED -ge $TIMEOUT ]]; then
echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway."
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
break
fi

PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)

if [[ -z "$PODS_EXIST" ]]; then
echo "All pods for [$LWS_NAME] have terminated."
break
else
echo "Waiting for pods to terminate: $PODS_EXIST"
sleep $SLEEP_INTERVAL
fi
done
13 changes: 10 additions & 3 deletions .github/workflows/_e2e_nightly_single_node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,17 @@ jobs:
env:
HF_HUB_OFFLINE: 1
VLLM_USE_MODELSCOPE: True
UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
UV_INDEX_STRATEGY: unsafe-best-match
UV_NO_CACHE: 1
UV_SYSTEM_PYTHON: 1
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
pip install uv

- name: uninstall vlm vllm-ascend and remove code (if pr test)
if: ${{ inputs.is_pr_test }}
Expand Down Expand Up @@ -110,16 +116,17 @@ jobs:
if: ${{ inputs.is_pr_test }}
working-directory: /vllm-workspace/vllm
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
VLLM_TARGET_DEVICE=empty uv pip install -e .

- name: Install vllm-project/vllm-ascend
if: ${{ inputs.is_pr_test }}
working-directory: /vllm-workspace/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
pip install uc-manager
uv pip install -r requirements-dev.txt
uv pip install -v -e .

- name: Install aisbench
if: ${{ inputs.is_pr_test }}
Expand Down
13 changes: 10 additions & 3 deletions .github/workflows/_e2e_nightly_single_node_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ jobs:
env:
VLLM_USE_MODELSCOPE: True
GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }}
UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
UV_INDEX_STRATEGY: unsafe-best-match
UV_NO_CACHE: 1
UV_SYSTEM_PYTHON: 1
steps:
- name: Check npu and CANN info
run: |
Expand All @@ -91,6 +96,7 @@ jobs:

update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20
update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20
pip install uv

- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v6
Expand All @@ -102,14 +108,15 @@ jobs:
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
VLLM_TARGET_DEVICE=empty uv pip install -e .

- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
pip install uc-manager
uv pip install -r requirements-dev.txt
uv pip install -v -e .

- name: Install tensorflow (for Molmo-7B-D-0924)
if: ${{ inputs.runner == 'linux-aarch64-a2b3-1' && contains(inputs.model_list, 'Molmo-7B-D-0924') }}
Expand Down
Loading
Loading