62 changes: 51 additions & 11 deletions .github/workflows/_e2e_nightly_multi_node.yaml
@@ -66,7 +66,7 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 8 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }}
   cancel-in-progress: true
 
 jobs:
@@ -80,7 +80,6 @@ jobs:
     env:
       KUBECONFIG: /tmp/kubeconfig
       NAMESPACE: vllm-project
-      LEADER_POD: vllm-0
     steps:
       - name: Decode kubeconfig from secrets
        run: |
@@ -101,6 +100,17 @@
       - name: Checkout code
         uses: actions/checkout@v6
 
+      - name: Set job variables
+        run: |
+          # Derive a unique, valid k8s resource name from config_file_path.
+          # Strip .yaml extension, lowercase, replace dots/underscores with hyphens, cap at 50 chars.
+          config_file="${{ inputs.config_file_path }}"
+          lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
+          LWS_NAME="vllm-${lws_suffix}"
+          echo "LWS_NAME=${LWS_NAME}" >> $GITHUB_ENV
+          echo "LEADER_POD=${LWS_NAME}-0" >> $GITHUB_ENV
+          echo "Computed LWS_NAME=${LWS_NAME}"
+
       - name: Prepare scripts
         run: |
           # prepare for lws entrypoint scripts
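
A quick sanity check of the sanitization pipeline in the step above. The config file names here are made-up examples, not paths from this repo; the real input is inputs.config_file_path:

# Hypothetical inputs; the pipeline mirrors the "Set job variables" step.
for config_file in "DeepSeek_V3.2.yaml" "a3_deepseek_pd.yaml"; do
  lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
  echo "${config_file} -> vllm-${lws_suffix}"
done
# Prints:
#   DeepSeek_V3.2.yaml -> vllm-deepseek-v3-2
#   a3_deepseek_pd.yaml -> vllm-a3-deepseek-pd

Note that the pipeline does not rewrite "/", so it assumes config_file_path is a bare file name; a nested path would yield an invalid resource name.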
@@ -110,14 +120,14 @@
         run: |
           set -euo pipefail
 
-          CRD_NAME="${CRD_NAME:-vllm}"
           TIMEOUT=${TIMEOUT:-120}
           SLEEP_INTERVAL=2
 
-          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
-          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found
+          echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..."
+          kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found
+          kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found
 
-          echo "Waiting for all pods starting with 'vllm' to be deleted..."
+          echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..."
           START_TIME=$(date +%s)
 
           while true; do
@@ -126,14 +136,14 @@
 
             if [[ $ELAPSED -ge $TIMEOUT ]]; then
               echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
-              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
+              kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
               exit 1
             fi
 
-            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
+            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
 
             if [[ -z "$PODS_EXIST" ]]; then
-              echo "All vllm pods deleted."
+              echo "All pods for [$LWS_NAME] deleted."
               break
             else
              echo "Waiting for pods to be deleted: $PODS_EXIST"
@@ -174,6 +184,7 @@
           fi
 
           jinja2 $TEMPLATE_FILE \
+            -D lws_name="$LWS_NAME" \
             -D size="$size" \
             -D replicas="$replicas" \
             -D image="$image" \
@@ -190,7 +201,7 @@
 
       - name: Waiting for pod ready
         run: |
-          POD_PREFIX="${POD_PREFIX:-vllm-0}"
+          POD_PREFIX="${LWS_NAME}-0"
           SIZE="${{ inputs.size }}"
           TIMEOUT=1200 # default timeout 20 minutes
 
@@ -260,7 +271,7 @@
           trap cleanup EXIT
 
           for i in $(seq 1 $((size - 1))); do
-            POD="vllm-0-${i}"
+            POD="${LWS_NAME}-0-${i}"
 
             echo "==== Collecting logs from worker pod: $POD ===="
             kubectl logs -f "$POD" -n "$NAMESPACE" \
@@ -290,5 +301,34 @@
       - name: Post process
         if: always()
         run: |
           echo "Current pod status:"
           kubectl get pods -n "$NAMESPACE" --ignore-not-found=true
+
+          echo "Deleting resources for [$LWS_NAME]..."
+          kubectl delete -f ./lws.yaml --ignore-not-found=true || true
+
+          echo "Waiting for pods of [$LWS_NAME] to fully terminate..."
+          TIMEOUT=300
+          SLEEP_INTERVAL=5
+          START_TIME=$(date +%s)
+
+          while true; do
+            NOW=$(date +%s)
+            ELAPSED=$((NOW - START_TIME))
+
+            if [[ $ELAPSED -ge $TIMEOUT ]]; then
+              echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway."
+              kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
+              break
+            fi
+
+            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
+
+            if [[ -z "$PODS_EXIST" ]]; then
+              echo "All pods for [$LWS_NAME] have terminated."
+              break
+            else
+              echo "Waiting for pods to terminate: $PODS_EXIST"
+              sleep $SLEEP_INTERVAL
+            fi
+          done
10 changes: 7 additions & 3 deletions .github/workflows/schedule_nightly_test_a2.yaml
@@ -21,8 +21,8 @@ name: Nightly-A2
 
 on:
   schedule:
-    # Run test at 24:00 Beijing time (UTC+8)
-    - cron: "0 16 * * *"
+    # Run test at 23:45 Beijing time (UTC+8)
+    - cron: "45 15 * * *"
   workflow_dispatch:
   pull_request:
     branches:
@@ -50,6 +50,10 @@
   parse-trigger:
     name: Parse trigger and determine test scope
     runs-on: linux-aarch64-a2b3-0
+    if: >-
+      github.event_name == 'schedule' ||
+      github.event_name == 'workflow_dispatch' ||
+      contains(github.event.pull_request.labels.*.name, 'nightly-test')
     outputs:
       should_run: ${{ steps.parse.outputs.should_run }}
       test_filter: ${{ steps.parse.outputs.test_filter }}
@@ -201,7 +205,7 @@
     if: always() && needs.parse-trigger.outputs.should_run == 'true'
     strategy:
       fail-fast: false
-      max-parallel: 1
+      max-parallel: 2
       matrix:
         test_config:
           - name: multi-node-deepseek-dp
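
Raising max-parallel from 1 to 2 is safe here only because of the concurrency change in the reusable workflow above: the group key now ends with inputs.config_file_path, so the two matrix entries fall into distinct groups and cancel-in-progress no longer kills the sibling job. Roughly, with hypothetical config file names:

# Hypothetical group keys for two matrix entries of the same nightly run:
echo "ascend-nightly-<workflow_ref>-<ref>-a2-deepseek_dp.yaml"
echo "ascend-nightly-<workflow_ref>-<ref>-a2-deepseek_pd.yaml"
# Different keys, so the jobs run in parallel; a re-triggered run still
# cancels its own predecessor for the same config.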
10 changes: 7 additions & 3 deletions .github/workflows/schedule_nightly_test_a3.yaml
@@ -22,8 +22,8 @@ name: Nightly-A3
 
 on:
   schedule:
-    # Run test at 24:00 Beijing time (UTC+8)
-    - cron: "0 16 * * *"
+    # Run test at 23:45 Beijing time (UTC+8)
+    - cron: "45 15 * * *"
   workflow_dispatch:
   pull_request:
     branches:
@@ -50,6 +50,10 @@
   parse-trigger:
     name: Parse trigger and determine test scope
     runs-on: linux-aarch64-a2b3-0
+    if: >-
+      github.event_name == 'schedule' ||
+      github.event_name == 'workflow_dispatch' ||
+      contains(github.event.pull_request.labels.*.name, 'nightly-test')
     outputs:
       should_run: ${{ steps.parse.outputs.should_run }}
       test_filter: ${{ steps.parse.outputs.test_filter }}
@@ -127,7 +131,7 @@
     if: always() && needs.parse-trigger.outputs.should_run == 'true'
     strategy:
       fail-fast: false
-      max-parallel: 1
+      max-parallel: 2
       matrix:
         test_config:
           - name: multi-node-deepseek-pd
6 changes: 3 additions & 3 deletions tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
@@ -1,7 +1,7 @@
 apiVersion: leaderworkerset.x-k8s.io/v1
 kind: LeaderWorkerSet
 metadata:
-  name: vllm
+  name: {{ lws_name | default("vllm") }}
   namespace: vllm-project
 spec:
   replicas: {{ replicas | default(1) }}
@@ -128,7 +128,7 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: vllm-leader
+  name: {{ lws_name | default("vllm") }}-leader
   namespace: vllm-project
 spec:
   ports:
@@ -137,6 +137,6 @@ spec:
       protocol: TCP
       targetPort: 8080
   selector:
-    leaderworkerset.sigs.k8s.io/name: vllm
+    leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }}
     role: leader
   type: ClusterIP
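
For local debugging, the template can be rendered the same way the workflow does. A sketch with placeholder values; only the -D names shown in the workflow hunk above (lws_name, size, replicas, image) are taken from the source, and it assumes the jinja2 command from the jinja2-cli package:

jinja2 tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 \
  -D lws_name="vllm-a2-deepseek-dp" \
  -D size=2 \
  -D replicas=1 \
  -D image="registry.example.com/vllm-ascend:nightly" \
  > lws.yaml

Omitting -D lws_name exercises the default("vllm") fallback, so an unparameterized render still produces the old fixed names (vllm, vllm-leader).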
6 changes: 3 additions & 3 deletions tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
@@ -1,7 +1,7 @@
 apiVersion: leaderworkerset.x-k8s.io/v1
 kind: LeaderWorkerSet
 metadata:
-  name: vllm
+  name: {{ lws_name | default("vllm") }}
   namespace: vllm-project
 spec:
   replicas: {{ replicas | default(1) }}
@@ -128,7 +128,7 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: vllm-leader
+  name: {{ lws_name | default("vllm") }}-leader
   namespace: vllm-project
 spec:
   ports:
@@ -137,6 +137,6 @@ spec:
       protocol: TCP
       targetPort: 8080
   selector:
-    leaderworkerset.sigs.k8s.io/name: vllm
+    leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }}
     role: leader
   type: ClusterIP
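
The Service only routes traffic if its selector matches labels actually present on the leader pod, so the templated selector has to stay in lockstep with metadata.name above. A quick post-deploy check, reusing the label keys from the selector:

# Confirm the leader pod carries the expected labels and the renamed
# Service resolved to endpoints.
kubectl get pods -n vllm-project \
  -l "leaderworkerset.sigs.k8s.io/name=${LWS_NAME},role=leader"
kubectl get endpoints "${LWS_NAME}-leader" -n vllm-project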