sgl-project · michael7193 · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026 · Apr 23, 2026
@@ -673,6 +673,36 @@ SGLang supports various environment variables that can be used to configure its
       <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Force using PyTorch gather/scatter fallback instead of Triton fused kernels for staging operations. Useful for debugging.</td>
       <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><code>false</code></td>
     </tr>
+    <tr>
+      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><code>SGLANG_ENABLE_PIPELINED_KV_TRANSFER</code></td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Enable layer-pipelined KV transfer. Splits prefill into layer groups and transfers KV cache incrementally after each group, overlapping RDMA transfer with GPU compute. Only effective in PD disaggregation prefill mode. When enabled together with the overlap schedule, the overlap schedule is automatically disabled (pipelined transfer subsumes its CPU sync savings). When used with <code>--hicache-storage-backend mooncake</code>, set <code>MOONCAKE_DEVICE</code> to a separate IB port (e.g. <code>mlx5_5</code>) to avoid RDMA resource conflicts.</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><code>false</code></td>
+    </tr>
+    <tr>
+      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><code>SGLANG_PIPELINE_GROUP_SIZE</code></td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>(Optional) Override the adaptive formula with a fixed number of layers per pipeline group. When not set, group_size is computed automatically based on prompt length.</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>Not set (adaptive)</td>
+    </tr>
+    <tr>
+      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><code>SGLANG_PIPELINE_MIN_TOKENS</code></td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Minimum average input token length to activate pipelined mode. Batches below this threshold use the normal (non-pipelined) path to avoid overhead on short prompts.</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><code>3072</code></td>
+    </tr>
+    <tr>
+      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><code>SGLANG_PIPELINE_SAT_MULTIPLIER</code></td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Multiplier for MIN_TOKENS that defines the prompt-length saturation point in the adaptive pipeline group-size formula.</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><code>3.0</code></td>
+    </tr>
+    <tr>
+      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><code>SGLANG_PIPELINE_MAX_ITERS</code></td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Maximum pipeline iterations (used for shortest eligible prompts). The adaptive formula linearly interpolates between MAX_ITERS and MIN_ITERS based on prompt length. Higher values mean smaller groups and more transfer overlap.</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><code>10</code></td>
+    </tr>
+    <tr>
+      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><code>SGLANG_PIPELINE_MIN_ITERS</code></td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Minimum pipeline iterations (used for longest prompts at saturation point = SAT_MULTIPLIER × MIN_TOKENS). Lower bound ensures large groups don't add excessive per-group overhead.</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><code>4</code></td>
+    </tr>
   </tbody>
 </table>
 

@@ -124,6 +124,33 @@ def send(
         """
         ...
 
+    def send_layer(  # base-class default: the body below only raises
+        self,
+        kv_indices: npt.NDArray[np.int32],
+        layer_id: int,
+        cuda_event: object,  # typed as plain object; presumably a CUDA event for this layer -- TODO confirm with callers
+        is_last: bool = False,  # presumably marks the final layer of a request -- TODO confirm with callers
+        state_indices: Optional[List[int]] = None,
+    ):
+        """Send a single layer's KV cache for layer-pipelined transfer.
+
+        Backends that support layer-pipelined KV transfer should override
+        this method. The default raises NotImplementedError so that
+        unsupported backends fail loudly if pipelining is misconfigured.
+        """
+        raise NotImplementedError(  # message names the concrete subclass via type(self).__name__
+            f"{type(self).__name__} does not support layer-pipelined KV transfer"
+        )
+
+    def send_final_metadata(  # base-class default: the body below only raises
+        self,
+        state_indices: Optional[List[int]] = None,
+    ):
+        """Send final metadata after layer-pipelined KV transfer."""
+        raise NotImplementedError(  # same message text as the send_layer default
+            f"{type(self).__name__} does not support layer-pipelined KV transfer"
+        )
+
     def pop_decode_prefix_len(self) -> int:
         return 0
 

@@ -811,6 +811,46 @@ def _prepare_send_indices(
 
         return kv_indices, index_slice, is_last_chunk, False
 
+    def _prepare_layer_send_indices(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+    ) -> Tuple[npt.NDArray[np.int32], slice, bool]:  # (kv_indices, index_slice, flag); flag is True on a dummy CP rank or when no indices remain
+        """Common pre-processing for per-layer sends.
+
+        Layer-pipelined transfer sends the same page slice once per layer, so it
+        must not advance curr_idx for every layer. It still needs the same CP
+        rank filtering as send(). Final status is sent separately after metadata
+        is written.
+        """
+        num_indices = len(kv_indices)  # length of the unfiltered input
+        index_slice = slice(self.curr_idx, self.curr_idx + num_indices)  # window starts at curr_idx; curr_idx is not assigned in this method
+
+        if self.kv_mgr.enable_all_cp_ranks_for_transfer:
+            cache_key = (self.curr_idx, num_indices)  # (offset, unfiltered length) identifies the slice being sent
+            cached_key = getattr(self, "_layer_send_cache_key", None)  # None default tolerates the attribute not being set yet
+            if cached_key == cache_key:
+                kv_indices, index_slice = self._layer_send_cache  # reuse the filtered result stored under the same key
+            else:
+                kv_indices, index_slice = filter_kv_indices_for_cp_rank(  # helper defined outside this view
+                    self.kv_mgr,
+                    kv_indices,
+                    index_slice,
+                )
+                self._layer_send_cache_key = cache_key  # remember the key ...
+                self._layer_send_cache = (kv_indices, index_slice)  # ... and the filtered pair for later calls
+        elif self.kv_mgr.is_dummy_cp_rank:
+            return kv_indices, index_slice, True  # dummy CP rank: inputs returned unfiltered, flag forced to True
+
+        return kv_indices, index_slice, len(kv_indices) == 0  # flag is True only when no indices are left
+
+    def _prepare_final_metadata_send(self) -> Tuple[slice, bool]:  # returns (zero-length slice, flag); flag is True only on a dummy CP rank
+        index_slice = slice(self.curr_idx, self.curr_idx)  # empty slice anchored at the current offset
+        self.curr_idx = self.num_kv_indices  # jump curr_idx to num_kv_indices in a single step
+        if self.kv_mgr.is_dummy_cp_rank:
+            self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Success)  # dummy CP rank: status set to Success here, before returning
+            return index_slice, True
+        return index_slice, False
+
     def send(
         self,
         kv_indices: npt.NDArray[np.int32],

@@ -25,6 +25,8 @@ class TransferKVChunk:
     prefill_aux_index: Optional[int]
     state_indices: Optional[List]
     chunk_id: Optional[int] = None
+    layer_id: Optional[int] = None
+    cuda_event: Optional[object] = None
     trace_ctx: Union[TraceReqContext, TraceNullContext] = dataclasses.field(
         default_factory=TraceNullContext
     )

@@ -63,6 +63,9 @@ def poll(self) -> KVPoll:
     def get_transfer_metric(self) -> KVTransferMetric:
         return KVTransferMetric()
 
+    def should_send_kv_chunk(self, num_pages: int, last_chunk: bool) -> bool:  # does not read any instance state
+        return num_pages > 0 or last_chunk  # True when there is at least one page, or for the last chunk even with zero pages
+
     def init(
         self,
         kv_indices: list[int],
@@ -83,6 +86,23 @@ def send(
             f"FakeKVSender send with kv_indices: {kv_indices}, state_indices: {state_indices}"
         )
 
+    def send_layer(  # fake implementation: the body only emits a debug log line
+        self,
+        kv_indices: npt.NDArray[np.int32],
+        layer_id: int = 0,
+        cuda_event=None,
+        is_last: bool = False,
+        state_indices=None,
+    ):
+        """Per-layer KV send stub for warmup."""
+        logger.debug(f"FakeKVSender send_layer layer_id={layer_id} is_last={is_last}")  # kv_indices, cuda_event and state_indices are unused
+
+    def send_final_metadata(self, state_indices=None):  # fake implementation: sets a flag and logs
+        self.has_sent = True  # the only state change made by this method
+        logger.debug(
+            f"FakeKVSender send_final_metadata with state_indices: {state_indices}"
+        )
+
     def failure_exception(self):
         raise Exception("Fake KVSender Exception")