
Commit 88da9d9

zccjjj and plusNew001 authored
[XPU] [CI] Change CI ep test from offline to online (#4885)
* change CI ep test from offline to online
* add ep all2all ci's changes, from offline to online
* change env var in ep-all2all ci test
* add expected response for ep8tp8 all2all
* Adapt to CI refactoring and support dual-concurrent code execution
* Adapt to CI refactoring and support dual-concurrent, second
* Explicitly specify the #port
* change the startup method of all2all
* Modify the command of all2all
* Update assertion to check multiple keywords
* Update assertion to check multiple keywords
* Update run_w4a8.py
* Update run_w4a8.py

---------

Co-authored-by: plusNew001 <[email protected]>
1 parent 4a0d881 commit 88da9d9

File tree

scripts/run_ci_xpu.sh
tests/ci_use/XPU_45T/run_45T.py
tests/ci_use/XPU_45T/run_ep_online.py
tests/ci_use/XPU_45T/run_w4a8.py

4 files changed: +208 -35 lines changed

scripts/run_ci_xpu.sh

Lines changed: 164 additions & 33 deletions
@@ -11,6 +11,9 @@ function stop_processes() {
     ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
     ps -efww | grep -E "$((8188 + GPU_ID * 100))" | grep -v grep | awk '{print $2}' | xargs kill -9 || true
     lsof -t -i :$((8188 + GPU_ID * 100)) | xargs kill -9 || true
+    for port in {$((8188 + GPU_ID * 100 + 10))..$((8188 + GPU_ID * 100 + 40))}; do
+        lsof -t -i :${port} | xargs kill -9 || true
+    done
 }
 stop_processes
 
@@ -286,10 +289,11 @@ if [ ${vl_test_exit_code} -ne 0 ]; then
 fi
 
 
-echo "============================Starting the EP8TP1 test!============================"
+echo "============================Starting the EP4TP4 online service test!============================"
 sleep 5
 rm -rf log/*
 rm -f core*
+# pkill -9 python  # the pipeline does not run this
 ipcrm --all=msg
 xpu-smi
 if [[ "$GPU_ID" == "0" ]]; then
@@ -312,11 +316,58 @@ cd xDeepEP
 bash build.sh
 cd -
 
-export enable_expert_parallel=1
-export enable_tensor_parallel=0
+export port_num=$((8188 + GPU_ID * 100))
+# Start the service
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
+    --port $port_num \
+    --tensor-parallel-size 4 \
+    --enable-expert-parallel \
+    --data-parallel-size 1 \
+    --max-model-len 32768 \
+    --max-num-seqs 64 \
+    --quantization "wint4" \
+    --engine-worker-queue-port $((port_num + 10)) \
+    --metrics-port $((port_num + 2)) \
+    --cache-queue-port $((port_num + 47873)) \
+    --disable-sequence-parallel-moe \
+    --gpu-memory-utilization 0.9 \
+    --load-choices "default" > server.log 2>&1 &
 
-python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
-ep_exit_code=$?
+sleep 60
+# Health probe
+TIMEOUT=$((15 * 60))
+INTERVAL=10
+ENDPOINT="http://0.0.0.0:${port_num}/health"
+START_TIME=$(date +%s)
+echo "Starting service health check, maximum wait time: ${TIMEOUT}s"
+while true; do
+    CURRENT_TIME=$(date +%s)
+    ELAPSED=$((CURRENT_TIME - START_TIME))
+    if [ $ELAPSED -ge $TIMEOUT ]; then
+        echo -e "\nService startup timed out: still not up after $((TIMEOUT/60)) minutes!"
+        stop_processes
+        cat server.log
+        echo "log/workerlog.0"
+        cat log/workerlog.0
+        exit 1
+    fi
+    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
+    echo -e "\rHealth check in progress... waited ${ELAPSED}s, current status code: ${HTTP_CODE}"
+    if [ "$HTTP_CODE" = "200" ]; then
+        echo -e "\nService started successfully! Took ${ELAPSED}s"
+        break
+    else
+        sleep $INTERVAL
+    fi
+done
+
+cat server.log
+
+# Run the online inference verification script
+python tests/ci_use/XPU_45T/run_ep_online.py
+ep_online_exit_code=$?
+echo ep_online_exit_code is ${ep_online_exit_code}
 
 unset BKCL_ENABLE_XDR
 unset BKCL_RDMA_NICS
@@ -327,26 +378,24 @@ unset XSHMEM_QP_NUM_PER_RANK
 unset BKCL_RDMA_VERBS
 stop_processes
 
-if [ ${ep_exit_code} -ne 0 ]; then
-    echo "log/workerlog.0"
+if [ ${ep_online_exit_code} -ne 0 ]; then
     cat log/workerlog.0
-    echo "EP8TP1 tests failed, please check the PR code"
+    echo "EP4TP4 online service tests failed, please check the PR code"
     exit 1
 fi
 
-
-echo "============================Starting the EP8TP8 allreduce test!============================"
+echo "============================Starting the EP4TP1 online service test!============================"
 sleep 5
 rm -rf log/*
 rm -f core*
+# pkill -9 python  # the pipeline does not run this
 ipcrm --all=msg
 xpu-smi
 if [[ "$GPU_ID" == "0" ]]; then
     export XPU_VISIBLE_DEVICES="0,1,2,3"
 else
     export XPU_VISIBLE_DEVICES="4,5,6,7"
 fi
-
 export BKCL_ENABLE_XDR=1
 export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
 export BKCL_TRACE_TOPO=1
@@ -355,12 +404,54 @@ export XSHMEM_MODE=1
 export XSHMEM_QP_NUM_PER_RANK=32
 export BKCL_RDMA_VERBS=1
 
-export enable_expert_parallel=1
-export enable_tensor_parallel=1
-export disable_sequence_parallel_moe=1
+export port_num=$((8188 + GPU_ID * 100))
+# Start the service
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
+    --port $port_num \
+    --tensor-parallel-size 1 \
+    --enable-expert-parallel \
+    --data-parallel-size 4 \
+    --max-model-len 32768 \
+    --max-num-seqs 64 \
+    --quantization "wint4" \
+    --engine-worker-queue-port "$((port_num + 10)),$((port_num + 20)),$((port_num + 30)),$((port_num + 40))" \
+    --metrics-port $((port_num + 2)) \
+    --cache-queue-port $((port_num + 47873)) \
+    --gpu-memory-utilization 0.9 \
+    --load-choices "default" > server.log 2>&1 &
+
+sleep 60
+# Health probe (same as above)
+TIMEOUT=$((15 * 60))
+INTERVAL=10
+ENDPOINT="http://0.0.0.0:${port_num}/health"
+START_TIME=$(date +%s)
+while true; do
+    CURRENT_TIME=$(date +%s)
+    ELAPSED=$((CURRENT_TIME - START_TIME))
+    if [ $ELAPSED -ge $TIMEOUT ]; then
+        echo -e "\nService startup timed out: still not up after $((TIMEOUT/60)) minutes!"
+        stop_processes
+        cat server.log
+        cat log/workerlog.0
+        exit 1
+    fi
+    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
+    if [ "$HTTP_CODE" = "200" ]; then
+        echo -e "\nService started successfully! Took ${ELAPSED}s"
+        break
+    else
+        sleep $INTERVAL
+    fi
+done
+
+cat server.log
 
-python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
-ep_exit_code=$?
+# Run the online inference verification script
+python tests/ci_use/XPU_45T/run_ep_online.py
+ep_online_exit_code=$?
+echo ep_online_exit_code is ${ep_online_exit_code}
 
 unset BKCL_ENABLE_XDR
 unset BKCL_RDMA_NICS
@@ -369,30 +460,27 @@ unset BKCL_PCIE_RING
 unset XSHMEM_MODE
 unset XSHMEM_QP_NUM_PER_RANK
 unset BKCL_RDMA_VERBS
-unset enable_expert_parallel
-unset enable_tensor_parallel
-unset disable_sequence_parallel_moe
 stop_processes
 
-if [ ${ep_exit_code} -ne 0 ]; then
-    echo "log/workerlog.0"
+if [ ${ep_online_exit_code} -ne 0 ]; then
     cat log/workerlog.0
-    echo "EP8TP8 allreduce tests failed, please check the PR code"
+    echo "EP4TP1 online service tests failed, please check the PR code"
     exit 1
 fi
 
-
-echo "============================Starting the EP8TP8 all2all test!============================"
+echo "============================Starting the EP4TP4 all2all test!============================"
 sleep 5
 rm -rf log/*
 rm -f core*
+# pkill -9 python  # the pipeline does not run this
 ipcrm --all=msg
 xpu-smi
 if [[ "$GPU_ID" == "0" ]]; then
     export XPU_VISIBLE_DEVICES="0,1,2,3"
 else
     export XPU_VISIBLE_DEVICES="4,5,6,7"
 fi
+
 export BKCL_ENABLE_XDR=1
 export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
 export BKCL_TRACE_TOPO=1
@@ -401,11 +489,57 @@ export XSHMEM_MODE=1
 export XSHMEM_QP_NUM_PER_RANK=32
 export BKCL_RDMA_VERBS=1
 
-export enable_expert_parallel=1
-export enable_tensor_parallel=1
+export port_num=$((8188 + GPU_ID * 100))
+# Start the service
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
+    --port $port_num \
+    --tensor-parallel-size 4 \
+    --enable-expert-parallel \
+    --data-parallel-size 1 \
+    --max-model-len 32768 \
+    --max-num-seqs 64 \
+    --quantization "wint4" \
+    --engine-worker-queue-port $((port_num + 10)) \
+    --metrics-port $((port_num + 2)) \
+    --cache-queue-port $((port_num + 47873)) \
+    --gpu-memory-utilization 0.9 \
+    --load-choices "default" > server.log 2>&1 &
 
-python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
-ep_exit_code=$?
+sleep 60
+# Health probe
+TIMEOUT=$((15 * 60))
+INTERVAL=10
+ENDPOINT="http://0.0.0.0:${port_num}/health"
+START_TIME=$(date +%s)
+echo "Starting service health check, maximum wait time: ${TIMEOUT}s"
+while true; do
+    CURRENT_TIME=$(date +%s)
+    ELAPSED=$((CURRENT_TIME - START_TIME))
+    if [ $ELAPSED -ge $TIMEOUT ]; then
+        echo -e "\nService startup timed out: still not up after $((TIMEOUT/60)) minutes!"
+        stop_processes
+        cat server.log
+        echo "log/workerlog.0"
+        cat log/workerlog.0
+        exit 1
+    fi
+    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
+    echo -e "\rHealth check in progress... waited ${ELAPSED}s, current status code: ${HTTP_CODE}"
+    if [ "$HTTP_CODE" = "200" ]; then
+        echo -e "\nService started successfully! Took ${ELAPSED}s"
+        break
+    else
+        sleep $INTERVAL
+    fi
+done
+
+cat server.log
+
+# Run the online inference verification script
+python tests/ci_use/XPU_45T/run_ep_online.py
+ep_online_exit_code=$?
+echo ep_online_exit_code is ${ep_online_exit_code}
 
 unset BKCL_ENABLE_XDR
 unset BKCL_RDMA_NICS
@@ -414,13 +548,10 @@ unset BKCL_PCIE_RING
 unset XSHMEM_MODE
 unset XSHMEM_QP_NUM_PER_RANK
 unset BKCL_RDMA_VERBS
-unset enable_expert_parallel
-unset enable_tensor_parallel
 stop_processes
 
-if [ ${ep_exit_code} -ne 0 ]; then
-    echo "log/workerlog.0"
+if [ ${ep_online_exit_code} -ne 0 ]; then
     cat log/workerlog.0
-    echo "EP8TP8 all2all tests failed, please check the PR code"
+    echo "EP4TP4 all2all online service tests failed, please check the PR code"
     exit 1
 fi
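
The same 15-minute health-check loop is repeated verbatim in all three online-service sections above. A minimal sketch of how it could be factored into a shared helper; the function name wait_for_health and its arguments are hypothetical and not part of this commit:

# Sketch only, not part of the commit: a reusable probe against the same
# curl-based /health endpoint used in the loops above.
wait_for_health() {
    local port="$1" timeout="${2:-900}" interval="${3:-10}"
    local endpoint="http://0.0.0.0:${port}/health"
    local start elapsed code
    start=$(date +%s)
    while true; do
        elapsed=$(( $(date +%s) - start ))
        if [ "$elapsed" -ge "$timeout" ]; then
            echo "Service did not become healthy within ${timeout}s"
            return 1
        fi
        code=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$endpoint" || true)
        if [ "$code" = "200" ]; then
            echo "Service healthy after ${elapsed}s"
            return 0
        fi
        sleep "$interval"
    done
}

# Possible call site, mirroring the inline loops above:
# wait_for_health "$port_num" || { cat server.log; cat log/workerlog.0; exit 1; }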

tests/ci_use/XPU_45T/run_45T.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def test_45t():
     )
     print(response.choices[0].message.content)
     # print(base_response)
-    assert "人工智能" in response.choices[0].message.content
+    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
 
 
 if __name__ == "__main__":

tests/ci_use/XPU_45T/run_ep_online.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import openai
+
+
+def test_ep():
+    ip = "0.0.0.0"
+    gpu_id = int(os.getenv("GPU_ID", "0"))
+    service_http_port = 8188 + gpu_id * 100  # matches the service configuration
+    client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
+    # Non-streaming chat
+    response = client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "user", "content": "你好,你是谁?"},
+        ],
+        temperature=1,
+        top_p=0,
+        max_tokens=64,
+        stream=False,
+    )
+
+    print(response.choices[0].message.content)
+    # print(base_response)
+    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
+
+
+if __name__ == "__main__":
+    test_ep()
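
For context, a sketch of how run_ci_xpu.sh drives this test, grounded in the script changes above; GPU_ID is assumed to be provided by the CI environment rather than set in the script:

# Sketch of the invocation used in run_ci_xpu.sh above. With GPU_ID set, the
# test probes the server started on port 8188 + GPU_ID * 100.
export GPU_ID=0                                   # assumption: first device group
python tests/ci_use/XPU_45T/run_ep_online.py      # asserts on the chat response
echo "ep_online_exit_code is $?"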

tests/ci_use/XPU_45T/run_w4a8.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def test_w4a8():
     )
     print(response.choices[0].message.content)
     # print(base_response)
-    assert "人工智能" in response.choices[0].message.content
+    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
 
 
 if __name__ == "__main__":
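
A sketch of the same checks done by hand with curl, assuming the api_server started in run_ci_xpu.sh is still listening with GPU_ID=0 (port 8188); the /v1/chat/completions path follows the OpenAI-compatible API the tests reach through the openai client:

# Hypothetical manual probe, not part of this commit.
PORT=8188
curl -s "http://0.0.0.0:${PORT}/health"           # health endpoint polled by the CI loop
curl -s "http://0.0.0.0:${PORT}/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{"model": "default", "messages": [{"role": "user", "content": "你好,你是谁?"}], "max_tokens": 64}'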

0 commit comments
