PaddlePaddle
diff --git a/examples/splitwise/start_v0.sh b/examples/splitwise/start_v0.sh
Lines changed: 3 additions & 2 deletions
diff --git a/examples/splitwise/start_v1.sh b/examples/splitwise/start_v1.sh
Lines changed: 7 additions & 2 deletions
diff --git a/examples/splitwise/start_v1_tp2.sh b/examples/splitwise/start_v1_tp2.sh
Lines changed: 7 additions & 2 deletions
diff --git a/examples/splitwise/start_v2.sh b/examples/splitwise/start_v2.sh
Lines changed: 12 additions & 3 deletions
diff --git a/examples/splitwise/start_v2_tp2.sh b/examples/splitwise/start_v2_tp2.sh
Lines changed: 8 additions & 3 deletions
diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py
Lines changed: 0 additions & 2 deletions
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
Lines changed: 36 additions & 17 deletions
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
Lines changed: 3 additions & 1 deletion
@@ -1,12 +1,14 @@
 #!/bin/bash
+set -e
+
 # Test splitwise deployment
 # v0 requires prefill and decode in one node and it uses local scheduler
 # v1 supports prefill and decode in multi node and it uses splitwise scheduler
 # v2 supports prefill and decode in multi node and it uses router and local scheduler
 
 # start prefill
 export FD_LOG_DIR="log_prefill"
-rm -rf ${FD_LOG_DIR}
+rm -rf log_*
 mkdir -p ${FD_LOG_DIR}
 
 export CUDA_VISIBLE_DEVICES=0
@@ -26,7 +28,6 @@ sleep 2
 
 # start decode
 export FD_LOG_DIR="log_decode"
-rm -rf ${FD_LOG_DIR}
 mkdir -p ${FD_LOG_DIR}
 
 export CUDA_VISIBLE_DEVICES=1
 
@@ -1,4 +1,6 @@
 #!/bin/bash
+set -e
+
 # Test splitwise deployment
 # v0 requires prefill and decode in one node and it uses local scheduler
 # v1 supports prefill and decode in multi node and it uses splitwise scheduler
@@ -14,6 +16,10 @@ SCRIPT_PATH=$(readlink -f "$0")
 SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
 export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
 echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
+if [ -z "${KVCACHE_RDMA_NICS}" ]; then
+  echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
+  exit 1
+fi
 
 # start redis
 if ! redis-cli ping &>/dev/null; then
@@ -28,7 +34,7 @@ sleep 1
 # start prefill
 export CUDA_VISIBLE_DEVICES=0
 export FD_LOG_DIR="log_prefill"
-rm -rf ${FD_LOG_DIR}
+rm -rf log_*
 mkdir -p ${FD_LOG_DIR}
 
 nohup python -m fastdeploy.entrypoints.openai.api_server \
@@ -52,7 +58,6 @@ sleep 1
 # start decode
 export CUDA_VISIBLE_DEVICES=1
 export FD_LOG_DIR="log_decode"
-rm -rf ${FD_LOG_DIR}
 mkdir -p ${FD_LOG_DIR}
 
 nohup python -m fastdeploy.entrypoints.openai.api_server \
 
@@ -1,4 +1,6 @@
 #!/bin/bash
+set -e
+
 # Test splitwise deployment
 # v0 requires prefill and decode in one node and it uses local scheduler
 # v1 supports prefill and decode in multi node and it uses splitwise scheduler
@@ -14,6 +16,10 @@ SCRIPT_PATH=$(readlink -f "$0")
 SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
 export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
 echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
+if [ -z "${KVCACHE_RDMA_NICS}" ]; then
+  echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
+  exit 1
+fi
 
 # start redis
 if ! redis-cli ping &>/dev/null; then
@@ -28,7 +34,7 @@ sleep 1
 # start prefill
 export CUDA_VISIBLE_DEVICES=0,1
 export FD_LOG_DIR="log_prefill"
-rm -rf ${FD_LOG_DIR}
+rm -rf log_*
 mkdir -p ${FD_LOG_DIR}
 
 nohup python -m fastdeploy.entrypoints.openai.api_server \
@@ -53,7 +59,6 @@ sleep 1
 # start decode
 export CUDA_VISIBLE_DEVICES=2,3
 export FD_LOG_DIR="log_decode"
-rm -rf ${FD_LOG_DIR}
 mkdir -p ${FD_LOG_DIR}
 
 nohup python -m fastdeploy.entrypoints.openai.api_server \
 
@@ -1,11 +1,15 @@
 #!/bin/bash
+set -e
+
 # Test splitwise deployment
 # v0 requires prefill and decode in one node and it uses local scheduler
 # v1 supports prefill and decode in multi node and it uses splitwise scheduler
 # v2 supports prefill and decode in multi node and it uses router and local scheduler
 
 # prepare environment
 MODEL_NAME="baidu/ERNIE-4.5-0.3B-Paddle"
+# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"
+
 export FD_DEBUG=1
 export ENABLE_V1_KVCACHE_SCHEDULER=0
 export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
@@ -14,22 +18,26 @@ SCRIPT_PATH=$(readlink -f "$0")
 SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
 export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
 echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
+if [ -z "${KVCACHE_RDMA_NICS}" ]; then
+  echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
+  exit 1
+fi
 
 # start router
 export FD_LOG_DIR="log_router"
-rm -rf ${FD_LOG_DIR}
+rm -rf log_*
 mkdir -p ${FD_LOG_DIR}
 
 router_port=9000
 nohup python -m fastdeploy.router.launch \
     --port ${router_port} \
+    --splitwise \
     2>&1 >${FD_LOG_DIR}/nohup &
 sleep 1
 
 # start prefill
 export CUDA_VISIBLE_DEVICES=0
 export FD_LOG_DIR="log_prefill"
-rm -rf ${FD_LOG_DIR}
 mkdir -p ${FD_LOG_DIR}
 
 nohup python -m fastdeploy.entrypoints.openai.api_server \
@@ -40,6 +48,7 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \
        --cache-queue-port 8103 \
        --max-model-len 32768 \
        --splitwise-role "prefill" \
+       --cache-transfer-protocol "ipc,rdma" \
        --rdma-comm-ports 8104 \
        --pd-comm-port 8105 \
        --router "0.0.0.0:${router_port}" \
@@ -49,7 +58,6 @@ sleep 1
 # start decode
 export CUDA_VISIBLE_DEVICES=1
 export FD_LOG_DIR="log_decode"
-rm -rf ${FD_LOG_DIR}
 mkdir -p ${FD_LOG_DIR}
 
 nohup python -m fastdeploy.entrypoints.openai.api_server \
@@ -60,6 +68,7 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \
        --cache-queue-port 8203 \
        --max-model-len 32768 \
        --splitwise-role "decode" \
+       --cache-transfer-protocol "ipc,rdma" \
        --rdma-comm-ports 8204 \
        --pd-comm-port 8205 \
        --router "0.0.0.0:${router_port}" \
 
@@ -1,4 +1,6 @@
 #!/bin/bash
+set -e
+
 # Test splitwise deployment
 # v0 requires prefill and decode in one node and it uses local scheduler
 # v1 supports prefill and decode in multi node and it uses splitwise scheduler
@@ -14,22 +16,26 @@ SCRIPT_PATH=$(readlink -f "$0")
 SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
 export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
 echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
+if [ -z "${KVCACHE_RDMA_NICS}" ]; then
+  echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
+  exit 1
+fi
 
 # start router
 export FD_LOG_DIR="log_router"
-rm -rf ${FD_LOG_DIR}
+rm -rf log_*
 mkdir -p ${FD_LOG_DIR}
 
 router_port=9000
 nohup python -m fastdeploy.router.launch \
     --port ${router_port} \
+    --splitwise \
     2>&1 >${FD_LOG_DIR}/nohup &
 sleep 1
 
 # start prefill
 export CUDA_VISIBLE_DEVICES=0,1
 export FD_LOG_DIR="log_prefill"
-rm -rf ${FD_LOG_DIR}
 mkdir -p ${FD_LOG_DIR}
 
 nohup python -m fastdeploy.entrypoints.openai.api_server \
@@ -50,7 +56,6 @@ sleep 1
 # start decode
 export CUDA_VISIBLE_DEVICES=2,3
 export FD_LOG_DIR="log_decode"
-rm -rf ${FD_LOG_DIR}
 mkdir -p ${FD_LOG_DIR}
 
 nohup python -m fastdeploy.entrypoints.openai.api_server \
 
@@ -35,8 +35,6 @@
 from fastdeploy.model_executor.ops.gpu import get_output_kv_signal, set_data_ipc
 from fastdeploy.utils import envs, get_logger
 
-logger = get_logger("cache_messager", "cache_messager.log")
-
 
 def parse_args():
     """
 
@@ -1303,6 +1303,24 @@ def print(self):
         logger.info("=============================================================")
 
 
+class RouterConfig:
+    """
+    Configuration for router
+    Attributes:
+        router: the url of router, such as http://127.0.0.1:8000
+        api_server_host: the host ip of model server
+        api_server_port: the http port of model server
+    """
+
+    def __init__(self, args: dict):
+        self.router = args["router"]
+        if self.router is not None and not self.router.startswith(("http://", "https://")):
+            self.router = f"http://{self.router}"
+
+        self.api_server_host = get_host_ip()
+        self.api_server_port = args["port"]
+
+
 class CommitConfig:
     """
     Configuration for tracking version information from version.txt
@@ -1404,6 +1422,7 @@ def __init__(
         speculative_config: SpeculativeConfig = None,
         eplb_config: EPLBConfig = None,
         structured_outputs_config: StructuredOutputsConfig = None,
+        router_config: RouterConfig = None,
         tokenizer: str = None,
         ips: str = None,
         use_warmup: bool = False,
@@ -1416,7 +1435,6 @@ def __init__(
         early_stop_config: Optional[Dict[str, Any]] = None,
         tool_parser: str = None,
         test_mode=False,
-        port=None,
     ):
         self.model_config: ModelConfig = model_config  # type: ignore
         self.cache_config: CacheConfig = cache_config  # type: ignore
@@ -1432,6 +1450,7 @@ def __init__(
         self.cache_config: CacheConfig = cache_config  # type: ignore
         self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
         self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
+        self.router_config: RouterConfig = router_config
 
         # Initialize cuda graph capture list
         max_capture_shape = self.scheduler_config.max_num_seqs
@@ -1459,7 +1478,6 @@ def __init__(
             self.ips = self.ips.split(",")
 
         self.host_ip = get_host_ip()
-        self.port = port
 
         if self.ips is None:
             self.nnode = 1
@@ -1730,39 +1748,39 @@ def init_cache_info(self):
         """
         initialize cache info
         """
-        # TODO: group the splitiwse params
+        # TODO: group the splitwise params, remove code of v0
         # v0 requires prefill and decode in one node and it uses local scheduler
         # v1 supports prefill and decode in multi node and it uses splitwise or dp scheduler
         # v2 supports prefill and decode in multi node and it uses router and local scheduler
         self.splitwise_version = None
-        if self.scheduler_config.name == "local" and self.scheduler_config.router is None:
+        if self.scheduler_config.name == "local" and (self.router_config is None or self.router_config.router is None):
             self.splitwise_version = "v0"
         elif self.scheduler_config.name in ("splitwise", "dp"):
             self.splitwise_version = "v1"
-        elif self.scheduler_config.name == "local" and self.scheduler_config.router:
+        elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router:
             self.splitwise_version = "v2"
         else:
             raise ValueError(
                 f"Unsupported scheduler mode, scheduler_name: {self.scheduler_config.name}, "
-                f"router: {self.scheduler_config.router}"
+                f"router_config: {self.router_config}"
             )
         logger.info(f"splitwise_version: {self.splitwise_version}")
 
+        if isinstance(self.parallel_config.engine_worker_queue_port, (int, str)):
+            engine_worker_queue_port = self.parallel_config.engine_worker_queue_port
+        else:
+            engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[
+                self.parallel_config.local_data_parallel_id
+            ]
+        connector_port = self.cache_config.pd_comm_port[0] if self.cache_config.pd_comm_port else None
+
         self.disaggregate_info = {}
         if self.scheduler_config.splitwise_role != "mixed":
             self.disaggregate_info["role"] = self.scheduler_config.splitwise_role
             self.disaggregate_info["cache_info"] = dict()
             current_protocol = self.cache_config.cache_transfer_protocol.split(",")
             self.disaggregate_info["transfer_protocol"] = current_protocol
 
-            if isinstance(self.parallel_config.engine_worker_queue_port, (int, str)):
-                engine_worker_queue_port = self.parallel_config.engine_worker_queue_port
-            else:
-                engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[
-                    self.parallel_config.local_data_parallel_id
-                ]
-            connector_port = self.cache_config.pd_comm_port[0] if self.cache_config.pd_comm_port else None
-
             for protocol in current_protocol:
                 if protocol == "ipc":
                     self.disaggregate_info["cache_info"][protocol] = {
@@ -1778,17 +1796,18 @@ def init_cache_info(self):
                     }
             logger.info(f"disaggregate_info: {self.disaggregate_info}")
 
-            self.splitwise_instance_info = {
+        if self.router_config:
+            self.register_info = {
                 "role": self.scheduler_config.splitwise_role,
                 "host_ip": self.host_ip,
-                "port": self.port,
+                "port": self.router_config.api_server_port,
                 "connector_port": connector_port,
                 "rdma_ports": self.cache_config.rdma_comm_ports,
                 "engine_worker_queue_port": engine_worker_queue_port,
                 "device_ids": self.local_device_ids,
                 "transfer_protocol": self.cache_config.cache_transfer_protocol.split(","),
             }
-            logger.info(f"splitwise_instance_info: {self.splitwise_instance_info}")
+            logger.info(f"register_info: {self.register_info}")
 
     def read_from_config(self):
         """
 
@@ -34,6 +34,7 @@
     ParallelConfig,
     PlasAttentionConfig,
     PoolerConfig,
+    RouterConfig,
     RunnerOption,
     SpeculativeConfig,
     StructuredOutputsConfig,
@@ -1141,6 +1142,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
         scheduler_cfg = self.create_scheduler_config()
         graph_opt_cfg = self.create_graph_optimization_config()
         plas_attention_config = self.create_plas_attention_config()
+        router_config = RouterConfig(all_dict)
 
         early_stop_cfg = self.create_early_stop_config()
         early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
@@ -1160,6 +1162,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
             speculative_config=speculative_cfg,
             eplb_config=eplb_cfg,
             structured_outputs_config=structured_outputs_config,
+            router_config=router_config,
             ips=self.ips,
             use_warmup=self.use_warmup,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
@@ -1172,5 +1175,4 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
             graph_opt_config=graph_opt_cfg,
             plas_attention_config=plas_attention_config,
             early_stop_config=early_stop_cfg,
-            port=self.port,
         )