From 3122790bf8b2293aa14d5f1106da9aabeb6b97ba Mon Sep 17 00:00:00 2001 From: plusNew001 Date: Wed, 15 Oct 2025 09:27:27 +0000 Subject: [PATCH 01/11] add xpu ci case --- scripts/run_ci_xpu.sh | 18 +++++++++++ tests/ci_use/XPU_45T/test_ep.py | 53 +++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 tests/ci_use/XPU_45T/test_ep.py diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh index 870b463d91..d7a49ef641 100644 --- a/scripts/run_ci_xpu.sh +++ b/scripts/run_ci_xpu.sh @@ -176,3 +176,21 @@ if [ ${kv_block_test_exit_code} -ne 0 ]; then echo "kv block相关测试失败,请检查pr代码" exit 1 fi + +echo "============================开始EP并行测试!============================" + +export XPU_VISIBLE_DEVICES="0,1,2,3" +export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4 +export BKCL_TRACE_TOPO=1 +export BKCL_PCIE_RING=1 +export XSHMEM_MODE=1 +export XSHMEM_QP_NUM_PER_RANK=32 +export BKCL_RDMA_VERBS=1 +python tests/ci_use/XPU_45T/test_ep.py +ep_exit_code=$? + +if [ ${ep_exit_code} -ne 0 ]; then + echo "EP并行 相关测试失败,请检查pr代码" + exit 1 +f \ No newline at end of file diff --git a/tests/ci_use/XPU_45T/test_ep.py b/tests/ci_use/XPU_45T/test_ep.py new file mode 100644 index 0000000000..a7dc554b3d --- /dev/null +++ b/tests/ci_use/XPU_45T/test_ep.py @@ -0,0 +1,53 @@ +import os +import copy +from fastdeploy import LLM, SamplingParams + +msg1=[ + {"role": "system", "content": ""}, + {"role": "user", "content": "北京天安门广场在哪里?"}, +] + +messages = [msg1, + ] + +# 采样参数 +sampling_params = SamplingParams(top_p=0, max_tokens=500) +model=os.getenv("model_path", "/ssd3/model/ERNIE-4.5-300B-A47B-Paddle") + +xpu_visible_devices=os.getenv("XPU_VISIBLE_DEVICES", "0") +xpu_device_num=len(xpu_visible_devices.split(',')) +enable_expert_parallel=True +if enable_expert_parallel: + tensor_parallel_size=1 + data_parallel_size=xpu_device_num +else: + tensor_parallel_size=xpu_device_num + data_parallel_size=1 +engine_worker_queue_port=[str(8023+i*10) for i in range(data_parallel_size)] +engine_worker_queue_port=",".join(engine_worker_queue_port) + +# messages=[copy.deepcopy(msg1) for i in range(data_parallel_size)] +print(f"messages: {messages}") + +llm = LLM(model=model, + enable_expert_parallel=enable_expert_parallel, + tensor_parallel_size=tensor_parallel_size, + data_parallel_size=data_parallel_size, + max_model_len=8192, + quantization="wint4", + engine_worker_queue_port=engine_worker_queue_port, + max_num_seqs=8, + ) + +# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理) +outputs = llm.chat(messages, sampling_params) + +# 输出结果 +for output in outputs: + prompt = output.prompt + generated_text = output.outputs.text + print(f"-"*100) + print(f"prompt: {prompt}") + print(f"-"*100) + print(f"generated_text: {generated_text}") + print(f"-"*100) \ No newline at end of file From b82625ce89fa6e3f66ebac3856e644b9a8409034 Mon Sep 17 00:00:00 2001 From: plusNew001 <95567040+plusNew001@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:46:47 +0800 Subject: [PATCH 02/11] Add xDeepEP download and build steps Download and build xDeepEP before running tests. --- scripts/run_ci_xpu.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh index d7a49ef641..612e6d5f98 100644 --- a/scripts/run_ci_xpu.sh +++ b/scripts/run_ci_xpu.sh @@ -187,10 +187,17 @@ export BKCL_PCIE_RING=1 export XSHMEM_MODE=1 export XSHMEM_QP_NUM_PER_RANK=32 export BKCL_RDMA_VERBS=1 + +wget -q https://paddle-qa.bj.bcebos.com/xpu_third_party/xDeepEP.tar.gz +tar -xzf xDeepEP.tar.gz +cd xDeepEP +bash build.sh +cd - + python tests/ci_use/XPU_45T/test_ep.py ep_exit_code=$? if [ ${ep_exit_code} -ne 0 ]; then echo "EP并行 相关测试失败,请检查pr代码" exit 1 -f \ No newline at end of file +f From 5fe0fed600488b029ff8a676f39389cc596eed64 Mon Sep 17 00:00:00 2001 From: plusNew001 <95567040+plusNew001@users.noreply.github.com> Date: Fri, 17 Oct 2025 10:34:27 +0800 Subject: [PATCH 03/11] Fix formatting and add missing sleep command --- scripts/run_ci_xpu.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh index 612e6d5f98..6925419baf 100644 --- a/scripts/run_ci_xpu.sh +++ b/scripts/run_ci_xpu.sh @@ -178,7 +178,8 @@ if [ ${kv_block_test_exit_code} -ne 0 ]; then fi echo "============================开始EP并行测试!============================" - +sleep 5 +xpu_smi export XPU_VISIBLE_DEVICES="0,1,2,3" export BKCL_ENABLE_XDR=1 export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4 From 235e2b49494f6314b849eb9df3aa08c3fe697b65 Mon Sep 17 00:00:00 2001 From: plusNew001 <95567040+plusNew001@users.noreply.github.com> Date: Fri, 17 Oct 2025 11:12:37 +0800 Subject: [PATCH 04/11] Update Docker image version in CI workflow --- .github/workflows/ci_xpu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_xpu.yml b/.github/workflows/ci_xpu.yml index f99ca7d172..7398af53d3 100644 --- a/.github/workflows/ci_xpu.yml +++ b/.github/workflows/ci_xpu.yml @@ -24,7 +24,7 @@ jobs: - name: Code Checkout env: - docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 + docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.2.0 run: | REPO="https://github.com/${{ github.repository }}.git" FULL_REPO="${{ github.repository }}" @@ -55,7 +55,7 @@ jobs: - name: Run CI unittest env: - docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 + docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.2.0 run: | runner_name="${{ runner.name }}" last_char="${runner_name: -1}" From 6f1252d3f8edfee98d1733817470d5235917313e Mon Sep 17 00:00:00 2001 From: plusNew001 <95567040+plusNew001@users.noreply.github.com> Date: Fri, 17 Oct 2025 12:51:50 +0800 Subject: [PATCH 05/11] Modify run_ci_xpu.sh for log cleanup and error handling Clean up log files before running tests and output worker log on failure. --- scripts/run_ci_xpu.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh index 6925419baf..3de41ac970 100644 --- a/scripts/run_ci_xpu.sh +++ b/scripts/run_ci_xpu.sh @@ -179,7 +179,9 @@ fi echo "============================开始EP并行测试!============================" sleep 5 -xpu_smi +rm -rf log/* +rm -f core* +xpu-smi export XPU_VISIBLE_DEVICES="0,1,2,3" export BKCL_ENABLE_XDR=1 export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4 @@ -199,6 +201,8 @@ python tests/ci_use/XPU_45T/test_ep.py ep_exit_code=$? if [ ${ep_exit_code} -ne 0 ]; then + echo "log/workerlog.0" + cat log/workerlog.0 echo "EP并行 相关测试失败,请检查pr代码" exit 1 f From 4f4b523bb652b116c4e0acdf210b5ed79d579c04 Mon Sep 17 00:00:00 2001 From: plusNew001 <95567040+plusNew001@users.noreply.github.com> Date: Fri, 17 Oct 2025 16:57:26 +0800 Subject: [PATCH 06/11] Enhance test_ep.py with process management and assertions Refactor test function to include process cleanup and assertions. --- tests/ci_use/XPU_45T/test_ep.py | 124 +++++++++++++++++++------------- 1 file changed, 74 insertions(+), 50 deletions(-) diff --git a/tests/ci_use/XPU_45T/test_ep.py b/tests/ci_use/XPU_45T/test_ep.py index a7dc554b3d..1ddcf50a98 100644 --- a/tests/ci_use/XPU_45T/test_ep.py +++ b/tests/ci_use/XPU_45T/test_ep.py @@ -1,53 +1,77 @@ import os -import copy +import psutil from fastdeploy import LLM, SamplingParams -msg1=[ - {"role": "system", "content": ""}, - {"role": "user", "content": "北京天安门广场在哪里?"}, -] - -messages = [msg1, - ] - -# 采样参数 -sampling_params = SamplingParams(top_p=0, max_tokens=500) -model=os.getenv("model_path", "/ssd3/model/ERNIE-4.5-300B-A47B-Paddle") - -xpu_visible_devices=os.getenv("XPU_VISIBLE_DEVICES", "0") -xpu_device_num=len(xpu_visible_devices.split(',')) -enable_expert_parallel=True -if enable_expert_parallel: - tensor_parallel_size=1 - data_parallel_size=xpu_device_num -else: - tensor_parallel_size=xpu_device_num - data_parallel_size=1 -engine_worker_queue_port=[str(8023+i*10) for i in range(data_parallel_size)] -engine_worker_queue_port=",".join(engine_worker_queue_port) - -# messages=[copy.deepcopy(msg1) for i in range(data_parallel_size)] -print(f"messages: {messages}") - -llm = LLM(model=model, - enable_expert_parallel=enable_expert_parallel, - tensor_parallel_size=tensor_parallel_size, - data_parallel_size=data_parallel_size, - max_model_len=8192, - quantization="wint4", - engine_worker_queue_port=engine_worker_queue_port, - max_num_seqs=8, - ) - -# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理) -outputs = llm.chat(messages, sampling_params) - -# 输出结果 -for output in outputs: - prompt = output.prompt - generated_text = output.outputs.text - print(f"-"*100) - print(f"prompt: {prompt}") - print(f"-"*100) - print(f"generated_text: {generated_text}") - print(f"-"*100) \ No newline at end of file +def test_fd_ep(): + """ + """ + + msg1 = [ + {"role": "system", "content": ""}, + {"role": "user", "content": "北京天安门广场在哪里?"}, + ] + messages = [msg1] + + # 采样参数 + sampling_params = SamplingParams(top_p=0, max_tokens=500) + + # 模型路径与设备配置 + model = os.getenv("model_path", "/home/ERNIE-4.5-300B-A47B-Paddle") + xpu_visible_devices = os.getenv("XPU_VISIBLE_DEVICES", "0") + xpu_device_num = len(xpu_visible_devices.split(',')) + + enable_expert_parallel = True + if enable_expert_parallel: + tensor_parallel_size = 1 + data_parallel_size = xpu_device_num + else: + tensor_parallel_size = xpu_device_num + data_parallel_size = 1 + + engine_worker_queue_port = [str(8023 + i * 10) for i in range(data_parallel_size)] + engine_worker_queue_port = ",".join(engine_worker_queue_port) + + print(f"[INFO] messages: {messages}") + + llm = LLM( + model=model, + enable_expert_parallel=enable_expert_parallel, + tensor_parallel_size=tensor_parallel_size, + data_parallel_size=data_parallel_size, + max_model_len=8192, + quantization="wint4", + engine_worker_queue_port=engine_worker_queue_port, + max_num_seqs=8, + ) + + try: + outputs = llm.chat(messages, sampling_params) + assert outputs, "❌ LLM 推理返回空结果。" + + for idx, output in enumerate(outputs): + prompt = output.prompt + generated_text = getattr(output.outputs, "text", "").strip() + + print(f"{'-'*100}") + print(f"[PROMPT {idx}] {prompt}") + print(f"{'-'*100}") + print(f"[GENERATED TEXT] {generated_text}") + print(f"{'-'*100}") + + # 核心断言:输出不能为空 + assert generated_text, f"❌ 推理结果为空 (index={idx})" + + finally: + # 无论是否报错都清理子进程 + current_process = psutil.Process(os.getpid()) + for child in current_process.children(recursive=True): + try: + child.kill() + print(f"[CLEANUP] 已杀死子进程 {child.pid}") + except Exception as e: + print(f"[WARN] 无法杀死子进程 {child.pid}: {e}") + print("✅ 已清理所有 FastDeploy 子进程。") + + +if __name__ == "__main__": + test_fastdeploy_llm() From 771600e1afffe9152206d1a83cee1d3e35b874ce Mon Sep 17 00:00:00 2001 From: plusNew001 <95567040+plusNew001@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:19:56 +0800 Subject: [PATCH 07/11] Replace test_fastdeploy_llm with test_fd_ep --- tests/ci_use/XPU_45T/test_ep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci_use/XPU_45T/test_ep.py b/tests/ci_use/XPU_45T/test_ep.py index 1ddcf50a98..0b4fe3a7d7 100644 --- a/tests/ci_use/XPU_45T/test_ep.py +++ b/tests/ci_use/XPU_45T/test_ep.py @@ -74,4 +74,4 @@ def test_fd_ep(): if __name__ == "__main__": - test_fastdeploy_llm() + test_fd_ep() From 392b9ded64e9c48bd690d050474e6db9b41ba7ba Mon Sep 17 00:00:00 2001 From: plusNew001 <95567040+plusNew001@users.noreply.github.com> Date: Fri, 17 Oct 2025 18:54:28 +0800 Subject: [PATCH 08/11] Fix conditional statement in run_ci_xpu.sh --- scripts/run_ci_xpu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh index 3de41ac970..320f00f7c5 100644 --- a/scripts/run_ci_xpu.sh +++ b/scripts/run_ci_xpu.sh @@ -205,4 +205,4 @@ if [ ${ep_exit_code} -ne 0 ]; then cat log/workerlog.0 echo "EP并行 相关测试失败,请检查pr代码" exit 1 -f +fi From 7b1eead28d9167d20976bbd63023b9209f61690a Mon Sep 17 00:00:00 2001 From: plusNew001 <95567040+plusNew001@users.noreply.github.com> Date: Tue, 21 Oct 2025 10:34:26 +0800 Subject: [PATCH 09/11] Update test_ep.py for string handling and formatting Fix string encoding issues and improve readability. --- tests/ci_use/XPU_45T/test_ep.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/ci_use/XPU_45T/test_ep.py b/tests/ci_use/XPU_45T/test_ep.py index 0b4fe3a7d7..c82242aa39 100644 --- a/tests/ci_use/XPU_45T/test_ep.py +++ b/tests/ci_use/XPU_45T/test_ep.py @@ -1,10 +1,12 @@ import os + import psutil + from fastdeploy import LLM, SamplingParams + def test_fd_ep(): - """ - """ + """ """ msg1 = [ {"role": "system", "content": ""}, @@ -18,7 +20,7 @@ def test_fd_ep(): # 模型路径与设备配置 model = os.getenv("model_path", "/home/ERNIE-4.5-300B-A47B-Paddle") xpu_visible_devices = os.getenv("XPU_VISIBLE_DEVICES", "0") - xpu_device_num = len(xpu_visible_devices.split(',')) + xpu_device_num = len(xpu_visible_devices.split(",")) enable_expert_parallel = True if enable_expert_parallel: From 83f3108f3448452d0618aa2eb1080a23b1efe74f Mon Sep 17 00:00:00 2001 From: plusNew001 <95567040+plusNew001@users.noreply.github.com> Date: Tue, 21 Oct 2025 14:29:32 +0800 Subject: [PATCH 10/11] Rename test_ep.py to run_ep.py --- tests/ci_use/XPU_45T/{test_ep.py => run_ep.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/ci_use/XPU_45T/{test_ep.py => run_ep.py} (100%) diff --git a/tests/ci_use/XPU_45T/test_ep.py b/tests/ci_use/XPU_45T/run_ep.py similarity index 100% rename from tests/ci_use/XPU_45T/test_ep.py rename to tests/ci_use/XPU_45T/run_ep.py From ab3d52494af4f374731a6d8ceb379d7057516c3a Mon Sep 17 00:00:00 2001 From: plusNew001 <95567040+plusNew001@users.noreply.github.com> Date: Tue, 21 Oct 2025 14:30:07 +0800 Subject: [PATCH 11/11] Change test script from test_ep.py to run_ep.py --- scripts/run_ci_xpu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh index 320f00f7c5..9afd81249b 100644 --- a/scripts/run_ci_xpu.sh +++ b/scripts/run_ci_xpu.sh @@ -197,7 +197,7 @@ cd xDeepEP bash build.sh cd - -python tests/ci_use/XPU_45T/test_ep.py +python tests/ci_use/XPU_45T/run_ep.py ep_exit_code=$? if [ ${ep_exit_code} -ne 0 ]; then