
Commit 08ca0f6

[Feature] [PD] add simple router and refine splitwise deployment (#4709)

* add simple router and refine splitwise deployment
* fix

1 parent 831266d

39 files changed: 2,397 additions and 171 deletions

benchmarks/backend_request_func.py

Lines changed: 5 additions & 5 deletions

@@ -94,10 +94,11 @@ async def async_request_eb_openai_chat_completions(
         "stream_options": {
             "include_usage": True,
             "continuous_usage_stats": True,
-        }
+        },
+        "max_tokens": request_func_input.output_len,
     }
     if request_func_input.response_format:
-        payload["response_format"] =request_func_input.response_format
+        payload["response_format"] = request_func_input.response_format

     # 超参由yaml传入
     payload.update(request_func_input.hyper_parameters)
@@ -132,13 +133,13 @@ async def async_request_eb_openai_chat_completions(

             chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
             if chunk != "[DONE]":
-                #print("####chunk:", chunk, type(chunk))
+                # print("####chunk:", chunk, type(chunk))
                 timestamp = time.perf_counter()
                 data = json.loads(chunk)

                 if request_id == "None" and "id" in data:
                     request_id = data["id"]
-
+
                 if choices := data.get("choices"):
                     content = choices[0]["delta"].get("content")
                     reason_content = choices[0]["delta"].get("reasoning_content")
@@ -164,7 +165,6 @@ async def async_request_eb_openai_chat_completions(
                 elif usage := data.get("usage", {}):
                     output.output_tokens = usage.get("completion_tokens", 0)
                     output.prompt_tokens = usage.get("prompt_tokens", 0)
-

                 most_recent_timestamp = timestamp

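
Net effect of these hunks, as a minimal sketch: the benchmark request payload now carries the requested output length as "max_tokens" next to "stream_options", and "response_format" is attached only when provided. The values below are illustrative stand-ins, not taken from the diff.

```python
# Illustrative payload shape after this change; the real code builds it inside
# async_request_eb_openai_chat_completions from request_func_input.
output_len = 1024                              # stands in for request_func_input.output_len
response_format = {"type": "json_object"}      # stands in for request_func_input.response_format

payload = {
    "stream_options": {
        "include_usage": True,
        "continuous_usage_stats": True,
    },
    "max_tokens": output_len,                  # field newly added by this commit
}
if response_format:
    payload["response_format"] = response_format

print(payload)
```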

benchmarks/benchmark_dataset.py

Lines changed: 3 additions & 3 deletions

@@ -46,7 +46,7 @@ class SampleRequest:
     prompt_len: int
     expected_output_len: int
     response_format: Optional[dict] = None
-
+

 class BenchmarkDataset(ABC):
     """BenchmarkDataset"""
@@ -299,7 +299,7 @@ def sample(
             prompt = entry["messages"][-1].get("content", "")
             history_QA = entry.get("messages", [])
             response_format = entry.get("response_format")
-            new_output_len = int(entry.get("max_tokens", 12288))
+            new_output_len = int(entry.get("max_tokens", output_len if output_len else 12288))

             if enable_multimodal_chat:
                 prompt = self.apply_multimodal_chat_transformation(prompt, None)
@@ -311,7 +311,7 @@ def sample(
                     prompt_len=0,
                     history_QA=history_QA,
                     expected_output_len=new_output_len,
-                    response_format=response_format
+                    response_format=response_format,
                 )
             )
             cnt += 1
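
The behavioral change here is the output-length fallback in sample(): a per-entry "max_tokens" still wins, but the caller-supplied output_len is now tried before the hard-coded 12288 default. A minimal sketch of that precedence (the helper name is ours, not from the file):

```python
from typing import Optional


def resolve_output_len(entry: dict, output_len: Optional[int]) -> int:
    """Mirror of the updated expression in BenchmarkDataset.sample()."""
    return int(entry.get("max_tokens", output_len if output_len else 12288))


assert resolve_output_len({"max_tokens": 256}, 4096) == 256   # per-entry value wins
assert resolve_output_len({}, 4096) == 4096                   # caller-supplied fallback (new)
assert resolve_output_len({}, None) == 12288                  # hard-coded default
```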

benchmarks/benchmark_serving.py

Lines changed: 4 additions & 4 deletions

@@ -352,7 +352,7 @@ async def benchmark(
         ignore_eos=ignore_eos,
         debug=debug,
         extra_body=extra_body,
-        response_format=response_format
+        response_format=response_format,
     )

     print("test_input:", test_input)
@@ -384,7 +384,7 @@ async def benchmark(
             logprobs=logprobs,
             ignore_eos=ignore_eos,
             extra_body=extra_body,
-            response_format=response_format
+            response_format=response_format,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -444,7 +444,7 @@ async def limited_request_func(request_func_input, pbar):
             debug=debug,
             ignore_eos=ignore_eos,
             extra_body=extra_body,
-            response_format=response_format
+            response_format=response_format,
         )
         tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
     outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -460,7 +460,7 @@ async def limited_request_func(request_func_input, pbar):
             api_url=base_url + "/stop_profile",
             output_len=test_output_len,
             logprobs=logprobs,
-            response_format=response_format
+            response_format=response_format,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:

benchmarks/yaml/qwen25_7b-vl-32k-bf16.yaml

Lines changed: 1 addition & 1 deletion (the removed and added lines differ only in trailing whitespace or the final newline)

@@ -3,4 +3,4 @@ max_num_seqs: 128
 gpu_memory_utilization: 0.85
 tensor_parallel_size: 1
 limit_mm_per_prompt: '{"image": 100, "video": 100}'
-enable_mm: True
+enable_mm: True

benchmarks/yaml/request_yaml/qwen25-vl-32k.yaml

Lines changed: 1 addition & 1 deletion (the removed and added lines differ only in trailing whitespace or the final newline)

@@ -5,4 +5,4 @@ metadata:
 max_tokens: 32768
 repetition_penalty: 1.05
 frequency_penalty: 0
-presence_penalty: 0
+presence_penalty: 0

docs/features/multi-node_deployment.md

Lines changed: 2 additions & 2 deletions (whitespace-only: the removed and added lines differ only in trailing whitespace)

@@ -26,7 +26,7 @@ We recommend using mpirun for one-command startup without manually starting each
 4. Ensure all nodes can resolve each other's hostnames

 * Online inference startup example:
-
+
 ```shell
 python -m fastdeploy.entrypoints.openai.api_server \
     --model baidu/ERNIE-4.5-300B-A47B-Paddle \
@@ -40,7 +40,7 @@ We recommend using mpirun for one-command startup without manually starting each
 ```

 * Offline startup example:
-
+
 ```python
 from fastdeploy.engine.sampling_params import SamplingParams
 from fastdeploy.entrypoints.llm import LLM

docs/zh/features/multi-node_deployment.md (Chinese documentation; same whitespace-only fix as the English page)

Lines changed: 2 additions & 2 deletions

@@ -26,7 +26,7 @@
 4. 确保所有节点能够解析彼此的主机名

 * 在线推理启动示例:
-
+
 ```shell
 python -m fastdeploy.entrypoints.openai.api_server \
     --model baidu/ERNIE-4.5-300B-A47B-Paddle \
@@ -40,7 +40,7 @@
 ```

 * 离线启动示例:
-
+
 ```python
 from fastdeploy.engine.sampling_params import SamplingParams
 from fastdeploy.entrypoints.llm import LLM

examples/splitwise/start_mixed.sh (new file)

Lines changed: 71 additions & 0 deletions

#!/bin/bash
set -e

wait_for_health() {
    local server_port=$1
    while true; do
        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
        if [ "$status_code" -eq 200 ]; then
            break
        else
            echo "Service not ready. Retrying in 2s..."
            sleep 2
        fi
    done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"

export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

unset http_proxy && unset https_proxy
rm -rf log_*

# start router
export FD_LOG_DIR="log_router"
mkdir -p ${FD_LOG_DIR}

router_port=9000
nohup python -m fastdeploy.router.launch \
    --port ${router_port} \
    2>&1 >${FD_LOG_DIR}/nohup &
sleep 1

# start modelserver 0
export CUDA_VISIBLE_DEVICES=0
export FD_LOG_DIR="log_server_0"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 8100 \
    --metrics-port 8101 \
    --engine-worker-queue-port 8102 \
    --cache-queue-port 8103 \
    --max-model-len 32768 \
    --router "0.0.0.0:${router_port}" \
    2>&1 >${FD_LOG_DIR}/nohup &
sleep 1

wait_for_health 8100

# start modelserver 1
export CUDA_VISIBLE_DEVICES=1
export FD_LOG_DIR="log_server_1"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 8200 \
    --metrics-port 8201 \
    --engine-worker-queue-port 8202 \
    --cache-queue-port 8203 \
    --max-model-len 32768 \
    --router "0.0.0.0:${router_port}" \
    2>&1 >${FD_LOG_DIR}/nohup &

wait_for_health 8200
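
Once both api_servers have registered with the router on port 9000, a client should only need the router's address. A hedged sketch follows; the /v1/chat/completions path and payload shape follow the OpenAI-compatible api_server convention and are assumptions, not something this diff shows.

```python
# Hypothetical client for the mixed deployment above: send the request to the
# router (port 9000), which is expected to forward it to a registered server.
import requests

resp = requests.post(
    "http://0.0.0.0:9000/v1/chat/completions",
    json={
        "model": "PaddlePaddle/ERNIE-4.5-0.3B-Paddle",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 32,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json())
```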

examples/splitwise/start_v0_tp1.sh (new file)

Lines changed: 66 additions & 0 deletions

#!/bin/bash
set -e

# Test splitwise deployment
# v0 requires prefill and decode in one node and it uses local scheduler
# v1 supports prefill and decode in multi node and it uses splitwise scheduler
# v2 supports prefill and decode in multi node and it uses router and local scheduler

wait_for_health() {
    local server_port=$1
    while true; do
        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
        if [ "$status_code" -eq 200 ]; then
            break
        else
            echo "Service not ready. Retrying in 2s..."
            sleep 2
        fi
    done
}

MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"
aistudio download --model ${MODEL_NAME}

unset http_proxy && unset https_proxy
rm -rf log_*

# start prefill
export FD_LOG_DIR="log_prefill"
mkdir -p ${FD_LOG_DIR}

export CUDA_VISIBLE_DEVICES=0
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 8100 \
    --metrics-port 8101 \
    --engine-worker-queue-port 8102 \
    --cache-queue-port 8103 \
    --max-model-len 32768 \
    --splitwise-role "prefill" \
    2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 8100

# start decode
export FD_LOG_DIR="log_decode"
mkdir -p ${FD_LOG_DIR}

export CUDA_VISIBLE_DEVICES=1
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 9000 \
    --metrics-port 9001 \
    --engine-worker-queue-port 9002 \
    --cache-queue-port 9003 \
    --max-model-len 32768 \
    --splitwise-role "decode" \
    --innode-prefill-ports 8102 \
    2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 9000
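
The readiness check these scripts implement with curl can also be done from Python when driving the launches programmatically. A small sketch, assuming the requests package is available; the /health endpoint and the HTTP 200 check come from the script above.

```python
import time

import requests


def wait_for_health(server_port: int, interval: float = 2.0) -> None:
    """Poll the api_server's /health endpoint until it answers HTTP 200."""
    url = f"http://0.0.0.0:{server_port}/health"
    while True:
        try:
            if requests.get(url, timeout=5).status_code == 200:
                return
        except requests.RequestException:
            pass
        print(f"Service on port {server_port} not ready. Retrying in {interval}s...")
        time.sleep(interval)


wait_for_health(8100)  # prefill
wait_for_health(9000)  # decode
```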

examples/splitwise/start_v1_tp1.sh (new file)

Lines changed: 96 additions & 0 deletions

#!/bin/bash
set -e

# Test splitwise deployment
# v0 requires prefill and decode in one node and it uses local scheduler
# v1 supports prefill and decode in multi node and it uses splitwise scheduler
# v2 supports prefill and decode in multi node and it uses router and local scheduler

wait_for_health() {
    local server_port=$1
    while true; do
        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
        if [ "$status_code" -eq 200 ]; then
            break
        else
            echo "Service not ready. Retrying in 2s..."
            sleep 2
        fi
    done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"

export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

SCRIPT_PATH=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
if [ -z "${KVCACHE_RDMA_NICS}" ]; then
    echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
    exit 1
fi

unset http_proxy && unset https_proxy
rm -rf log_*

# start redis
if ! redis-cli ping &>/dev/null; then
    echo "Redis is not running. Starting redis-server..."
    redis-server --daemonize yes
    sleep 1
else
    echo "Redis is already running."
fi
sleep 1

# start prefill
export CUDA_VISIBLE_DEVICES=0
export FD_LOG_DIR="log_prefill"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 8100 \
    --metrics-port 8101 \
    --engine-worker-queue-port 8102 \
    --cache-queue-port 8103 \
    --max-model-len 32768 \
    --splitwise-role "prefill" \
    --cache-transfer-protocol "rdma,ipc" \
    --rdma-comm-ports 8104 \
    --pd-comm-port 8105 \
    --scheduler-name "splitwise" \
    --scheduler-host "127.0.0.1" \
    --scheduler-port 6379 \
    --scheduler-ttl 9000 \
    2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 8100

# start decode
export CUDA_VISIBLE_DEVICES=1
export FD_LOG_DIR="log_decode"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
    --model ${MODEL_NAME} \
    --port 9000 \
    --metrics-port 9001 \
    --engine-worker-queue-port 9002 \
    --cache-queue-port 9003 \
    --max-model-len 32768 \
    --splitwise-role "decode" \
    --cache-transfer-protocol "rdma,ipc" \
    --rdma-comm-ports 9004 \
    --pd-comm-port 9005 \
    --scheduler-name "splitwise" \
    --scheduler-host "127.0.0.1" \
    --scheduler-port 6379 \
    --scheduler-ttl 9000 \
    2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 9000
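
The v1 path additionally depends on Redis (the splitwise scheduler's store at 127.0.0.1:6379) and on the RDMA NIC list exported by get_rdma_nics.sh. A small preflight sketch mirroring those two checks; redis-py is an assumed dependency, while the script itself uses redis-cli.

```python
import os
import sys

import redis  # pip install redis


def preflight() -> None:
    """Mirror the script's checks before launching the prefill/decode servers."""
    if not os.environ.get("KVCACHE_RDMA_NICS"):
        sys.exit("KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh")
    try:
        redis.Redis(host="127.0.0.1", port=6379, socket_connect_timeout=2).ping()
    except redis.exceptions.ConnectionError:
        sys.exit("Redis is not reachable on 127.0.0.1:6379; start redis-server first.")


if __name__ == "__main__":
    preflight()
    print("Splitwise v1 prerequisites look OK.")
```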
