diff --git a/docs/backend/pd_disaggregation.md b/docs/backend/pd_disaggregation.md index e77164372ca..9dbc2705d3a 100644 --- a/docs/backend/pd_disaggregation.md +++ b/docs/backend/pd_disaggregation.md @@ -47,6 +47,23 @@ $ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --dis # decode 1 $ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --enable-deepep-moe --deepep-mode low_latency --mem-fraction-static 0.8 --max-running-requests 128 ``` +### Advanced Configuration + +PD Disaggregation with Mooncake supports the following environment variables for fine-grained control over system behavior. + +#### Prefill Server Configuration +| Variable | Description | Default | +|:--------:|:-----------:|:--------:| +| **`SGLANG_DISAGGREGATION_THREAD_POOL_SIZE`** | Controls the total number of worker threads for KV transfer operations per TP rank | A dynamic value calculated as `int(0.75 * os.cpu_count()) // 8`, which is constrained to be larger than 4 and less than 12 to ensure efficiency and prevent thread race conditions | +| **`SGLANG_DISAGGREGATION_QUEUE_SIZE`** | Sets the maximum number of pending tasks in the parallel transfer queue | `4` | +| **`SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT`** | Timeout (seconds) for receiving destination KV indices during request initialization | `30` | + +#### Decode Server Configuration +| Variable | Description | Default | +|:--------:|:-----------:|:--------:| +| **`SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL`** | Interval (seconds) between health checks to prefill bootstrap servers | `5.0` | +| **`SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE`** | Number of consecutive heartbeat failures before a prefill server is marked offline | `2` | + ## NIXL ### Requirements diff --git 
a/python/sglang/srt/disaggregation/mooncake/conn.py b/python/sglang/srt/disaggregation/mooncake/conn.py index 4c3faeeb6b7..940a25d7423 100644 --- a/python/sglang/srt/disaggregation/mooncake/conn.py +++ b/python/sglang/srt/disaggregation/mooncake/conn.py @@ -677,14 +677,15 @@ def __init__( self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping) self.aux_index = None self.bootstrap_server_url = bootstrap_addr - self.init_time = time.time() self.conclude_state = None + self.init_time = None # inner state self.curr_idx = 0 def init(self, num_kv_indices: int, aux_index: Optional[int] = None): self.num_kv_indices = num_kv_indices self.aux_index = aux_index + self.init_time = time.time() def send( self, @@ -713,15 +714,16 @@ def poll(self) -> KVPoll: if status in (KVPoll.Success, KVPoll.Failed): self.conclude_state = status elif status == KVPoll.Bootstrapping: - now = time.time() - elapsed = now - self.init_time - if elapsed >= self.kv_mgr.bootstrap_time_out: - self.kv_mgr.record_failure( - self.bootstrap_room, - f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping", - ) - self.conclude_state = KVPoll.Failed - return KVPoll.Failed + if self.init_time is not None: + now = time.time() + elapsed = now - self.init_time + if elapsed >= self.kv_mgr.bootstrap_time_out: + self.kv_mgr.record_failure( + self.bootstrap_room, + f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping", + ) + self.conclude_state = KVPoll.Failed + return KVPoll.Failed return status else: