diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index 6ceb9332367..7f98b32739a 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -69,35 +69,12 @@ jobs: # This is the runner with no NPU for k8s controller runs-on: ${{ inputs.runner }} container: - image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu env: KUBECONFIG: /tmp/kubeconfig - KUBECTL: /root/.cache/.kube/kubectl NAMESPACE: vllm-project LEADER_POD: vllm-0 - RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }} steps: - - name: Install system denpendencies - run: | - # configure apt and pip source - sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - pip install jinja2-cli - - - name: Install kubectl - run: | - # Install kubectl - arch=$(uname -m) - - if echo "$arch" | grep -qiE "arm|aarch64"; then - echo "Detected ARM architecture: $arch" - KUBECTL="$KUBECTL"_arm - fi - install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl - - # Verify kubectl installation - kubectl version --client=true - - name: Decode kubeconfig from secrets run: | # Decode and save kubeconfig @@ -110,8 +87,6 @@ jobs: run: | # prepare for lws entrypoint scripts install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh - # clear log directory - rm -fr $RESULT_FILE - name: Clear resources run: | @@ -157,10 +132,6 @@ jobs: replicas="${{ inputs.replicas }}" image="${{ inputs.image }}" config_file_path="${{ inputs.config_file_path }}" - vllm_version="${{ inputs.vllm_version }}" - vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}" - vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}" - result_file_path="$RESULT_FILE" fail_tag=FAIL_TAG_"${{ 
inputs.config_file_path }}" echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV @@ -174,19 +145,17 @@ jobs: if [ "${{ inputs.soc_version }}" = "a3" ]; then npu_per_node=16 + TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2" else npu_per_node=8 + TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2" fi - jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \ + jinja2 $TEMPLATE_FILE \ -D size="$size" \ -D replicas="$replicas" \ -D image="$image" \ -D config_file_path="$config_file_path" \ - -D vllm_version="$vllm_version" \ - -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \ - -D vllm_ascend_ref="$vllm_ascend_ref" \ - -D result_file_path="$result_file_path" \ -D npu_per_node="$npu_per_node" \ -D fail_tag="$fail_tag" \ --outfile lws.yaml diff --git a/.github/workflows/nightly_test_a2.yaml b/.github/workflows/nightly_test_a2.yaml index de2ed06ebae..aac1592c7e4 100644 --- a/.github/workflows/nightly_test_a2.yaml +++ b/.github/workflows/nightly_test_a2.yaml @@ -93,13 +93,13 @@ jobs: uses: ./.github/workflows/_e2e_nightly_multi_node.yaml with: soc_version: a2 - runner: linux-aarch64-a2-0 + runner: linux-amd64-cpu-8-hk image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' replicas: 1 size: ${{ matrix.test_config.size }} config_file_path: ${{ matrix.test_config.config_file_path }} secrets: - KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }} + KUBECONFIG_B64: ${{ secrets.KUBECONFIG_HK_001_INTERNAL_B64 }} single-node-accuracy-tests: if: >- diff --git a/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 new file mode 100644 index 00000000000..77a025cea5a --- /dev/null +++ b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 @@ -0,0 +1,138 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: vllm + namespace: vllm-project +spec: + replicas: {{ replicas | default(1) }} + leaderWorkerTemplate: + size: {{ 
size | default(2) }} + restartPolicy: None + leaderTemplate: + metadata: + labels: + role: leader + spec: + schedulerName: volcano + tolerations: + - key: "instance" + operator: "Equal" + value: "vllm" + effect: "NoSchedule" + containers: + - name: vllm-leader + imagePullPolicy: Always + image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }} + env: + - name: CONFIG_YAML_PATH + value: {{ config_file_path | default("DeepSeek-V3.yaml") }} + - name: WORKSPACE + value: "/vllm-workspace" + - name: FAIL_TAG + value: {{ fail_tag | default("FAIL_TAG") }} + command: + - sh + - -c + - | + bash /root/.cache/tests/run.sh + resources: + limits: + huawei.com/ascend-1980: {{ npu_per_node | default("8") }} + memory: 512Gi + ephemeral-storage: 100Gi + requests: + huawei.com/ascend-1980: {{ npu_per_node | default("8") }} + ephemeral-storage: 100Gi + cpu: 125 + ports: + - containerPort: 8080 + # readinessProbe: + # tcpSocket: + # port: 8080 + # initialDelaySeconds: 15 + # periodSeconds: 10 + volumeMounts: + - mountPath: /root/.cache + name: shared-volume + - mountPath: /usr/local/Ascend/driver/tools + name: driver-tools + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 15Gi + - name: shared-volume + persistentVolumeClaim: + claimName: vllm-project-hk001 + - name: driver-tools + hostPath: + path: /usr/local/Ascend/driver/tools + workerTemplate: + spec: + schedulerName: volcano + tolerations: + - key: "instance" + operator: "Equal" + value: "vllm" + effect: "NoSchedule" + containers: + - name: vllm-worker + imagePullPolicy: Always + image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }} + env: + - name: CONFIG_YAML_PATH + value: {{ config_file_path | default("DeepSeek-V3.yaml") }} + - name: WORKSPACE + value: "/vllm-workspace" + - name: FAIL_TAG + value: {{ fail_tag | default("FAIL_TAG") }} + command: + - sh + - -c + 
- | + bash /root/.cache/tests/run.sh + resources: + limits: + huawei.com/ascend-1980: {{ npu_per_node | default("8") }} + memory: 512Gi + ephemeral-storage: 100Gi + requests: + huawei.com/ascend-1980: {{ npu_per_node | default("8") }} + ephemeral-storage: 100Gi + cpu: 125 + volumeMounts: + - mountPath: /root/.cache + name: shared-volume + - mountPath: /usr/local/Ascend/driver/tools + name: driver-tools + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 15Gi + - name: shared-volume + persistentVolumeClaim: + claimName: vllm-project-hk001 + - name: driver-tools + hostPath: + path: /usr/local/Ascend/driver/tools +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-leader + namespace: vllm-project +spec: + ports: + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + leaderworkerset.sigs.k8s.io/name: vllm + role: leader + type: ClusterIP diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 index f7cd188054a..10be7ad74d3 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 @@ -22,13 +22,6 @@ spec: value: {{ config_file_path | default("DeepSeek-V3.yaml") }} - name: WORKSPACE value: "/vllm-workspace" - # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here. 
- - name: VLLM_ASCEND_VERSION - value: {{ vllm_ascend_ref | default("main") }} - - name: VLLM_ASCEND_REMOTE_URL - value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} - - name: RESULT_FILE_PATH - value: {{ result_file_path | default("/root/.cache/tests/ret") }} - name: FAIL_TAG value: {{ fail_tag | default("FAIL_TAG") }} command: @@ -81,13 +74,6 @@ spec: value: {{ config_file_path | default("DeepSeek-V3.yaml") }} - name: WORKSPACE value: "/vllm-workspace" - # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here. - - name: VLLM_ASCEND_VERSION - value: {{ vllm_ascend_ref | default("main") }} - - name: VLLM_ASCEND_REMOTE_URL - value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} - - name: RESULT_FILE_PATH - value: {{ result_file_path | default("/root/.cache/tests/ret") }} - name: FAIL_TAG value: {{ fail_tag | default("FAIL_TAG") }} command: diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh index d6cca47e4cb..2b536db944f 100644 --- a/tests/e2e/nightly/multi_node/scripts/run.sh +++ b/tests/e2e/nightly/multi_node/scripts/run.sh @@ -167,8 +167,8 @@ run_tests_with_log() { if [ $ret -eq 0 ]; then print_success "All tests passed!" else - print_failure "Some tests failed, please check the error stack above for details.\ - If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary." + print_failure "Some tests failed, please check the error stack above for details. \ +If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary." fi fi }