@@ -11,6 +11,9 @@ function stop_processes() {
1111 ps -efww | grep -E ' api_server' | grep -v grep | awk ' {print $2}' | xargs kill -9 || true
1212 ps -efww | grep -E " $(( 8188 + GPU_ID * 100 )) " | grep -v grep | awk ' {print $2}' | xargs kill -9 || true
1313 lsof -t -i :$(( 8188 + GPU_ID * 100 )) | xargs kill -9 || true
14+ for port in {$(( 8188 + GPU_ID * 100 + 10 )) ..$(( 8188 + GPU_ID * 100 + 40 )) }; do
15+ lsof -t -i :${port} | xargs kill -9 || true
16+ done
1417}
1518stop_processes
1619
@@ -286,10 +289,11 @@ if [ ${vl_test_exit_code} -ne 0 ]; then
286289fi
287290
288291
# Pre-test housekeeping for the first online-service case. The patch renames
# the banner from "EP8TP1" to "EP4TP4 online service"; the cleanup steps are
# unchanged: pause 5 s, wipe log/ and core dumps, remove all message queues
# with ipcrm, then run xpu-smi (presumably to record device state in the CI log).
289- echo " ============================开始 EP8TP1 测试 !============================"
292+ echo " ============================开始 EP4TP4 在线服务测试 !============================"
290293sleep 5
291294rm -rf log/*
292295rm -f core*
296+ # pkill -9 python # not executed in the CI pipeline
293297ipcrm --all=msg
294298xpu-smi
295299if [[ " $GPU_ID " == " 0" ]]; then
@@ -312,11 +316,58 @@ cd xDeepEP
312316bash build.sh
313317cd -
314318
315- export enable_expert_parallel=1
316- export enable_tensor_parallel=0
# Base port for this job: 8188 + GPU_ID * 100. Every other service port passed
# below is an offset from it (worker queue +10, metrics +2, cache queue +47873).
# NOTE(review): the port sweep in stop_processes covers base and
# base+10..base+40 only; the metrics and cache-queue ports are outside that
# range, so they are released only if the owning process is matched by one of
# the other kill rules — confirm that is sufficient.
319+ export port_num=$(( 8188 + GPU_ID * 100 ))
320+ # Start the API server in the background with tensor-parallel size 4, expert
# parallelism enabled, data-parallel size 1 and sequence-parallel MoE disabled.
# stdout and stderr are both redirected to server.log.
321+ python -m fastdeploy.entrypoints.openai.api_server \
322+ --model ${MODEL_PATH} /ERNIE-4.5-300B-A47B-Paddle \
323+ --port $port_num \
324+ --tensor-parallel-size 4 \
325+ --enable-expert-parallel \
326+ --data-parallel-size 1 \
327+ --max-model-len 32768 \
328+ --max-num-seqs 64 \
329+ --quantization " wint4" \
330+ --engine-worker-queue-port $(( port_num + 10 )) \
331+ --metrics-port $(( port_num + 2 )) \
332+ --cache-queue-port $(( port_num + 47873 )) \
333+ --disable-sequence-parallel-moe \
334+ --gpu-memory-utilization 0.9 \
335+ --load-choices " default" > server.log 2>&1 &
317336
318- python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
319- ep_exit_code=$?
# Wait for the background server to become healthy.
# A fixed 60 s grace period comes first; the 15-minute TIMEOUT is measured from
# START_TIME, which is taken after that sleep, so the total wait can reach
# roughly 16 minutes.
337+ sleep 60
338+ # Liveness probe
339+ TIMEOUT=$(( 15 * 60 ))
340+ INTERVAL=10
341+ ENDPOINT=" http://0.0.0.0:${port_num} /health"
342+ START_TIME=$( date +%s)
343+ echo " 开始服务健康检查,最长等待时间:${TIMEOUT} 秒"
344+ while true ; do
345+ CURRENT_TIME=$( date +%s)
346+ ELAPSED=$(( CURRENT_TIME - START_TIME))
# Timeout path: stop the service, dump server.log and log/workerlog.0 for
# diagnosis, and fail the whole script.
347+ if [ $ELAPSED -ge $TIMEOUT ]; then
348+ echo -e " \n服务启动超时:经过 $(( TIMEOUT/ 60 )) 分钟服务仍未启动!"
349+ stop_processes
350+ cat server.log
351+ echo " log/workerlog.0"
352+ cat log/workerlog.0
353+ exit 1
354+ fi
# Probe the endpoint: -s silences progress output, -o /dev/null discards the
# body, -w prints only the HTTP status code, -m 2 caps the probe at 2 seconds.
# "|| true" keeps a failed probe from producing a non-zero status here.
355+ HTTP_CODE=$( curl -s -o /dev/null -w " %{http_code}" -m 2 " $ENDPOINT " || true)
356+ echo -e " \r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE} "
# Only a 200 response ends the wait; anything else sleeps INTERVAL seconds and retries.
357+ if [ " $HTTP_CODE " = " 200" ]; then
358+ echo -e " \n服务启动成功!耗时 ${ELAPSED} 秒"
359+ break
360+ else
361+ sleep $INTERVAL
362+ fi
363+ done
364+
365+ cat server.log
366+
367+ # Run the online-inference verification script against the running server
# and keep its exit status in ep_online_exit_code so it can be checked after
# the environment has been cleaned up.
368+ python tests/ci_use/XPU_45T/run_ep_online.py
369+ ep_online_exit_code=$?
370+ echo ep_online_exit_code is ${ep_online_exit_code}
320371
321372unset BKCL_ENABLE_XDR
322373unset BKCL_RDMA_NICS
@@ -327,26 +378,24 @@ unset XSHMEM_QP_NUM_PER_RANK
327378unset BKCL_RDMA_VERBS
328379stop_processes
329380
# Gate on the result of the online-inference script: on a non-zero status,
# dump log/workerlog.0, print the failure message and exit 1.
# The patch switches the checked variable from ep_exit_code to
# ep_online_exit_code and drops the "log/workerlog.0" label that used to be
# echoed before the dump.
330- if [ ${ep_exit_code} -ne 0 ]; then
331- echo " log/workerlog.0"
381+ if [ ${ep_online_exit_code} -ne 0 ]; then
332382 cat log/workerlog.0
333- echo " EP8TP1 相关测试失败 ,请检查pr代码"
383+ echo " EP4TP4 在线服务相关测试失败 ,请检查pr代码"
334384 exit 1
335385fi
336386
337-
338- echo " ============================开始 EP8TP8 allreduce 测试!============================"
# Pre-test housekeeping for the EP4TP1 online-service case: print the banner,
# pause 5 s, wipe log/ and core dumps, remove all message queues with ipcrm,
# and run xpu-smi (presumably to record device state in the CI log).
387+ echo " ============================开始 EP4TP1 在线服务测试!============================"
338389sleep 5
339390rm -rf log/*
340391rm -f core*
391+ # pkill -9 python # not executed in the CI pipeline
341392ipcrm --all=msg
342393xpu-smi
# Pick the four devices for this job: 0-3 when GPU_ID is 0, otherwise 4-7.
343394if [[ " $GPU_ID " == " 0" ]]; then
344395 export XPU_VISIBLE_DEVICES=" 0,1,2,3"
345396else
346397 export XPU_VISIBLE_DEVICES=" 4,5,6,7"
347398fi
349-
350399export BKCL_ENABLE_XDR=1
351400export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
352401export BKCL_TRACE_TOPO=1
@@ -355,12 +404,54 @@ export XSHMEM_MODE=1
355404export XSHMEM_QP_NUM_PER_RANK=32
356405export BKCL_RDMA_VERBS=1
357406
358- export enable_expert_parallel=1
359- export enable_tensor_parallel=1
360- export disable_sequence_parallel_moe=1
# Base port for this job: 8188 + GPU_ID * 100; the other service ports below
# are offsets from it.
407+ export port_num=$(( 8188 + GPU_ID * 100 ))
408+ # Start the API server in the background with tensor-parallel size 1, expert
# parallelism enabled and data-parallel size 4. Four comma-separated
# engine-worker-queue ports are passed (base+10, +20, +30, +40) — presumably
# one per data-parallel rank; confirm against the server's CLI documentation.
# stdout and stderr are both redirected to server.log.
409+ python -m fastdeploy.entrypoints.openai.api_server \
410+ --model ${MODEL_PATH} /ERNIE-4.5-300B-A47B-Paddle \
411+ --port $port_num \
412+ --tensor-parallel-size 1 \
413+ --enable-expert-parallel \
414+ --data-parallel-size 4 \
415+ --max-model-len 32768 \
416+ --max-num-seqs 64 \
417+ --quantization " wint4" \
418+ --engine-worker-queue-port " $(( port_num + 10 )) ,$(( port_num + 20 )) ,$(( port_num + 30 )) ,$(( port_num + 40 )) " \
419+ --metrics-port $(( port_num + 2 )) \
420+ --cache-queue-port $(( port_num + 47873 )) \
421+ --gpu-memory-utilization 0.9 \
422+ --load-choices " default" > server.log 2>&1 &
423+
# Wait for the background server to become healthy.
# A fixed 60 s grace period comes first; the 15-minute TIMEOUT is measured from
# START_TIME, which is taken after that sleep.
# This copy of the loop prints no start banner and no per-iteration progress line.
424+ sleep 60
425+ # Liveness probe (same procedure as the previous test case)
426+ TIMEOUT=$(( 15 * 60 ))
427+ INTERVAL=10
428+ ENDPOINT=" http://0.0.0.0:${port_num} /health"
429+ START_TIME=$( date +%s)
430+ while true ; do
431+ CURRENT_TIME=$( date +%s)
432+ ELAPSED=$(( CURRENT_TIME - START_TIME))
# Timeout path: stop the service, dump server.log and log/workerlog.0 for
# diagnosis, and fail the whole script.
433+ if [ $ELAPSED -ge $TIMEOUT ]; then
434+ echo -e " \n服务启动超时:经过 $(( TIMEOUT/ 60 )) 分钟服务仍未启动!"
435+ stop_processes
436+ cat server.log
437+ cat log/workerlog.0
438+ exit 1
439+ fi
# Probe the endpoint: -s silences progress output, -o /dev/null discards the
# body, -w prints only the HTTP status code, -m 2 caps the probe at 2 seconds.
# "|| true" keeps a failed probe from producing a non-zero status here.
440+ HTTP_CODE=$( curl -s -o /dev/null -w " %{http_code}" -m 2 " $ENDPOINT " || true)
# Only a 200 response ends the wait; anything else sleeps INTERVAL seconds and retries.
441+ if [ " $HTTP_CODE " = " 200" ]; then
442+ echo -e " \n服务启动成功!耗时 ${ELAPSED} 秒"
443+ break
444+ else
445+ sleep $INTERVAL
446+ fi
447+ done
448+
# Print the server's startup log once the health check has passed.
449+ cat server.log
361450
362- python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
363- ep_exit_code=$?
451+ # Run the online-inference verification script (replaces the removed pytest
# invocation of run_ep.py) and keep its exit status in ep_online_exit_code so
# it can be checked after the environment has been cleaned up.
452+ python tests/ci_use/XPU_45T/run_ep_online.py
453+ ep_online_exit_code=$?
454+ echo ep_online_exit_code is ${ep_online_exit_code}
364455
365456unset BKCL_ENABLE_XDR
366457unset BKCL_RDMA_NICS
@@ -369,30 +460,27 @@ unset BKCL_PCIE_RING
369460unset XSHMEM_MODE
370461unset XSHMEM_QP_NUM_PER_RANK
371462unset BKCL_RDMA_VERBS
372- unset enable_expert_parallel
373- unset enable_tensor_parallel
374- unset disable_sequence_parallel_moe
375463stop_processes
376464
# Gate on the result of the online-inference script: on a non-zero status,
# dump log/workerlog.0, print the failure message and exit 1.
# The patch switches the checked variable from ep_exit_code to
# ep_online_exit_code and drops the "log/workerlog.0" label that used to be
# echoed before the dump.
377- if [ ${ep_exit_code} -ne 0 ]; then
378- echo " log/workerlog.0"
465+ if [ ${ep_online_exit_code} -ne 0 ]; then
379466 cat log/workerlog.0
380- echo " EP8TP8 allreduce 相关测试失败 ,请检查pr代码"
467+ echo " EP4TP1 在线服务相关测试失败 ,请检查pr代码"
381468 exit 1
382469fi
383470
384-
385- echo " ============================开始 EP8TP8 all2all 测试!============================"
# Pre-test housekeeping for the EP4TP4 all2all case: print the banner, pause
# 5 s, wipe log/ and core dumps, remove all message queues with ipcrm, and run
# xpu-smi (presumably to record device state in the CI log).
471+ echo " ============================开始 EP4TP4 all2all 测试!============================"
386472sleep 5
387473rm -rf log/*
388474rm -f core*
475+ # pkill -9 python # not executed in the CI pipeline
389476ipcrm --all=msg
390477xpu-smi
# Pick the four devices for this job: 0-3 when GPU_ID is 0, otherwise 4-7.
391478if [[ " $GPU_ID " == " 0" ]]; then
392479 export XPU_VISIBLE_DEVICES=" 0,1,2,3"
393480else
394481 export XPU_VISIBLE_DEVICES=" 4,5,6,7"
395482fi
483+
396484export BKCL_ENABLE_XDR=1
397485export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
398486export BKCL_TRACE_TOPO=1
@@ -401,11 +489,57 @@ export XSHMEM_MODE=1
401489export XSHMEM_QP_NUM_PER_RANK=32
402490export BKCL_RDMA_VERBS=1
403491
404- export enable_expert_parallel=1
405- export enable_tensor_parallel=1
# Base port for this job: 8188 + GPU_ID * 100; the other service ports below
# are offsets from it (worker queue +10, metrics +2, cache queue +47873).
492+ export port_num=$(( 8188 + GPU_ID * 100 ))
493+ # Start the API server in the background with tensor-parallel size 4, expert
# parallelism enabled and data-parallel size 1. Unlike the first EP4TP4 launch
# in this patch, --disable-sequence-parallel-moe is NOT passed here.
# stdout and stderr are both redirected to server.log.
494+ python -m fastdeploy.entrypoints.openai.api_server \
495+ --model ${MODEL_PATH} /ERNIE-4.5-300B-A47B-Paddle \
496+ --port $port_num \
497+ --tensor-parallel-size 4 \
498+ --enable-expert-parallel \
499+ --data-parallel-size 1 \
500+ --max-model-len 32768 \
501+ --max-num-seqs 64 \
502+ --quantization " wint4" \
503+ --engine-worker-queue-port $(( port_num + 10 )) \
504+ --metrics-port $(( port_num + 2 )) \
505+ --cache-queue-port $(( port_num + 47873 )) \
506+ --gpu-memory-utilization 0.9 \
507+ --load-choices " default" > server.log 2>&1 &
406508
407- python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
408- ep_exit_code=$?
# Wait for the background server to become healthy.
# A fixed 60 s grace period comes first; the 15-minute TIMEOUT is measured from
# START_TIME, which is taken after that sleep, so the total wait can reach
# roughly 16 minutes.
509+ sleep 60
510+ # Liveness probe
511+ TIMEOUT=$(( 15 * 60 ))
512+ INTERVAL=10
513+ ENDPOINT=" http://0.0.0.0:${port_num} /health"
514+ START_TIME=$( date +%s)
515+ echo " 开始服务健康检查,最长等待时间:${TIMEOUT} 秒"
516+ while true ; do
517+ CURRENT_TIME=$( date +%s)
518+ ELAPSED=$(( CURRENT_TIME - START_TIME))
# Timeout path: stop the service, dump server.log and log/workerlog.0 for
# diagnosis, and fail the whole script.
519+ if [ $ELAPSED -ge $TIMEOUT ]; then
520+ echo -e " \n服务启动超时:经过 $(( TIMEOUT/ 60 )) 分钟服务仍未启动!"
521+ stop_processes
522+ cat server.log
523+ echo " log/workerlog.0"
524+ cat log/workerlog.0
525+ exit 1
526+ fi
# Probe the endpoint: -s silences progress output, -o /dev/null discards the
# body, -w prints only the HTTP status code, -m 2 caps the probe at 2 seconds.
# "|| true" keeps a failed probe from producing a non-zero status here.
527+ HTTP_CODE=$( curl -s -o /dev/null -w " %{http_code}" -m 2 " $ENDPOINT " || true)
528+ echo -e " \r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE} "
# Only a 200 response ends the wait; anything else sleeps INTERVAL seconds and retries.
529+ if [ " $HTTP_CODE " = " 200" ]; then
530+ echo -e " \n服务启动成功!耗时 ${ELAPSED} 秒"
531+ break
532+ else
533+ sleep $INTERVAL
534+ fi
535+ done
536+
537+ cat server.log
538+
539+ # Run the online-inference verification script against the running server
# and keep its exit status in ep_online_exit_code so it can be checked after
# the environment has been cleaned up.
540+ python tests/ci_use/XPU_45T/run_ep_online.py
541+ ep_online_exit_code=$?
542+ echo ep_online_exit_code is ${ep_online_exit_code}
409543
410544unset BKCL_ENABLE_XDR
411545unset BKCL_RDMA_NICS
@@ -414,13 +548,10 @@ unset BKCL_PCIE_RING
414548unset XSHMEM_MODE
415549unset XSHMEM_QP_NUM_PER_RANK
416550unset BKCL_RDMA_VERBS
417- unset enable_expert_parallel
418- unset enable_tensor_parallel
419551stop_processes
420552
# Gate on the result of the online-inference script: on a non-zero status,
# dump log/workerlog.0, print the failure message and exit 1.
# The patch switches the checked variable from ep_exit_code to
# ep_online_exit_code and drops the "log/workerlog.0" label that used to be
# echoed before the dump.
421- if [ ${ep_exit_code} -ne 0 ]; then
422- echo " log/workerlog.0"
553+ if [ ${ep_online_exit_code} -ne 0 ]; then
423554 cat log/workerlog.0
424- echo " EP8TP8 all2all 相关测试失败 ,请检查pr代码"
555+ echo " EP4TP4 all2all 在线服务相关测试失败 ,请检查pr代码"
425556 exit 1
426557fi
0 commit comments