Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 4 additions & 35 deletions .github/workflows/_e2e_nightly_multi_node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,35 +69,12 @@ jobs:
# This is the runner with no NPU for k8s controller
runs-on: ${{ inputs.runner }}
container:
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu
env:
KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project
LEADER_POD: vllm-0
RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
steps:
- name: Install system denpendencies
run: |
# configure apt and pip source
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
pip install jinja2-cli

- name: Install kubectl
run: |
# Install kubectl
arch=$(uname -m)

if echo "$arch" | grep -qiE "arm|aarch64"; then
echo "Detected ARM architecture: $arch"
KUBECTL="$KUBECTL"_arm
fi
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl

# Verify kubectl installation
kubectl version --client=true

- name: Decode kubeconfig from secrets
run: |
# Decode and save kubeconfig
Expand All @@ -110,8 +87,6 @@ jobs:
run: |
# prepare for lws entrypoint scripts
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
# clear log directory
rm -fr $RESULT_FILE

- name: Clear resources
run: |
Expand Down Expand Up @@ -157,10 +132,6 @@ jobs:
replicas="${{ inputs.replicas }}"
image="${{ inputs.image }}"
config_file_path="${{ inputs.config_file_path }}"
vllm_version="${{ inputs.vllm_version }}"
vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
result_file_path="$RESULT_FILE"
fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV

Expand All @@ -174,19 +145,17 @@ jobs:

if [ "${{ inputs.soc_version }}" = "a3" ]; then
npu_per_node=16
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2"
else
npu_per_node=8
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2"
fi

jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
jinja2 $TEMPLATE_FILE \
-D size="$size" \
-D replicas="$replicas" \
-D image="$image" \
-D config_file_path="$config_file_path" \
-D vllm_version="$vllm_version" \
-D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
-D vllm_ascend_ref="$vllm_ascend_ref" \
-D result_file_path="$result_file_path" \
-D npu_per_node="$npu_per_node" \
-D fail_tag="$fail_tag" \
--outfile lws.yaml
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/nightly_test_a2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,13 @@ jobs:
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a2
runner: linux-aarch64-a2-0
runner: linux-amd64-cpu-8-hk
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_HK_001_INTERNAL_B64 }}

single-node-accuracy-tests:
if: >-
Expand Down
138 changes: 138 additions & 0 deletions tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: vllm
namespace: vllm-project
spec:
replicas: {{ replicas | default(1) }}
leaderWorkerTemplate:
size: {{ size | default(2) }}
restartPolicy: None
leaderTemplate:
metadata:
labels:
role: leader
spec:
schedulerName: volcano
tolerations:
- key: "instance"
operator: "Equal"
value: "vllm"
effect: "NoSchedule"
containers:
- name: vllm-leader
imagePullPolicy: Always
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}
env:
- name: CONFIG_YAML_PATH
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/vllm-workspace"
- name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }}
command:
- sh
- -c
- |
bash /root/.cache/tests/run.sh
resources:
limits:
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
memory: 512Gi
ephemeral-storage: 100Gi
requests:
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
ephemeral-storage: 100Gi
cpu: 125
ports:
- containerPort: 8080
# readinessProbe:
# tcpSocket:
# port: 8080
# initialDelaySeconds: 15
# periodSeconds: 10
volumeMounts:
- mountPath: /root/.cache
name: shared-volume
- mountPath: /usr/local/Ascend/driver/tools
name: driver-tools
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
- name: shared-volume
persistentVolumeClaim:
claimName: vllm-project-hk001
- name: driver-tools
hostPath:
path: /usr/local/Ascend/driver/tools
workerTemplate:
spec:
schedulerName: volcano
tolerations:
- key: "instance"
operator: "Equal"
value: "vllm"
effect: "NoSchedule"
containers:
- name: vllm-worker
imagePullPolicy: Always
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}
env:
- name: CONFIG_YAML_PATH
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/vllm-workspace"
- name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }}
command:
- sh
- -c
- |
bash /root/.cache/tests/run.sh
resources:
limits:
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
memory: 512Gi
ephemeral-storage: 100Gi
requests:
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
ephemeral-storage: 100Gi
cpu: 125
volumeMounts:
- mountPath: /root/.cache
name: shared-volume
- mountPath: /usr/local/Ascend/driver/tools
name: driver-tools
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
- name: shared-volume
persistentVolumeClaim:
claimName: vllm-project-hk001
- name: driver-tools
hostPath:
path: /usr/local/Ascend/driver/tools
---
apiVersion: v1
kind: Service
metadata:
name: vllm-leader
namespace: vllm-project
spec:
ports:
- name: http
port: 8080
protocol: TCP
targetPort: 8080
selector:
leaderworkerset.sigs.k8s.io/name: vllm
role: leader
type: ClusterIP
Comment thread
Potabk marked this conversation as resolved.
14 changes: 0 additions & 14 deletions tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,6 @@ spec:
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/vllm-workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_ASCEND_VERSION
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret") }}
- name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }}
command:
Expand Down Expand Up @@ -81,13 +74,6 @@ spec:
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/vllm-workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_ASCEND_VERSION
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret") }}
- name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }}
command:
Expand Down
4 changes: 2 additions & 2 deletions tests/e2e/nightly/multi_node/scripts/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ run_tests_with_log() {
if [ $ret -eq 0 ]; then
print_success "All tests passed!"
else
print_failure "Some tests failed, please check the error stack above for details.\
If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary."
print_failure "Some tests failed, please check the error stack above for details. \
If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary."
fi
fi
}
Expand Down
Loading