From 4871b45efcc8d9afefa161967c01e6ffb657d84c Mon Sep 17 00:00:00 2001
From: Feiyue Zhai <feiyue.zhai@amd.com>
Date: Thu, 12 Mar 2026 11:00:32 +0800
Subject: [PATCH 1/3] [PD] Add SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS env
 var for configurable KV transfer overlap

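On decode workers, the req_to_token_pool pre_alloc_size was hardcoded to 0
whenever max_num_reqs > 32, which limited KV cache transfer concurrency. This
adds an environment variable that sets the number of extra pool slots in that
case, so more KV transfers can overlap with decode execution without raising
max_running_requests. The default of 0 keeps the previous behavior; when
max_num_reqs <= 32 the existing 2x pre-allocation still applies and the
variable is ignored.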

Made-with: Cursor
---
 python/sglang/srt/environ.py                                  | 4 ++++
 .../sglang/srt/model_executor/model_runner_kv_cache_mixin.py  | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
index e9c6481ba9f8..e8fdc7d17155 100644
--- a/python/sglang/srt/environ.py
+++ b/python/sglang/srt/environ.py
@@ -244,6 +244,10 @@ class Envs:
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT = EnvInt(300)
     SGLANG_DISAGGREGATION_NIXL_BACKEND = EnvStr("UCX")
     SGLANG_DISAGGREGATION_ALL_CP_RANKS_TRANSFER = EnvBool(False)
+    # Extra slots in req_to_token_pool for decode workers (only effective when
+    # max_num_reqs > 32). Increases pool capacity so more KV cache transfers
+    # can overlap with decode execution without raising max_running_requests.
+    SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS = EnvInt(0)
 
     # Scheduler: others:
     SGLANG_EMPTY_CACHE_INTERVAL = EnvFloat(-1)  # in seconds. Set if you observe high memory accumulation over a long serving period.
diff --git a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
index 588da80342ca..0f25ab8ca81f 100644
--- a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
+++ b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
@@ -388,7 +388,9 @@ def _init_pools(self: ModelRunner):
 
                 # subscribe memory for pre-allocated requests
                 # if max_num_reqs <= 32, we pre-allocate 2x requests
-                pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0
+                from sglang.srt.environ import envs
+                pre_alloc_size = envs.SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS.get()
+                pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else pre_alloc_size
                 if config := self.mambaish_config:
                     self.req_to_token_pool = HybridMambaDecodeReqToTokenPool(
                         size=max_num_reqs,

From 163ea06deec7843260c7e2597af5b5d7649f87cc Mon Sep 17 00:00:00 2001
From: Feiyue Zhai <feiyue.zhai@amd.com>
Date: Fri, 13 Mar 2026 10:37:59 +0800
Subject: [PATCH 2/3] fix format

---
 .../sglang/srt/model_executor/model_runner_kv_cache_mixin.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
index 0f25ab8ca81f..cdb142e765c9 100644
--- a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
+++ b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
@@ -389,8 +389,11 @@ def _init_pools(self: ModelRunner):
                 # subscribe memory for pre-allocated requests
                 # if max_num_reqs <= 32, we pre-allocate 2x requests
                 from sglang.srt.environ import envs
+
                 pre_alloc_size = envs.SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS.get()
-                pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else pre_alloc_size
+                pre_alloc_size = (
+                    max_num_reqs * 2 if max_num_reqs <= 32 else pre_alloc_size
+                )
                 if config := self.mambaish_config:
                     self.req_to_token_pool = HybridMambaDecodeReqToTokenPool(
                         size=max_num_reqs,

From a232e1c29b47b70afdbf6bd8ee0f0125dc34e7fe Mon Sep 17 00:00:00 2001
From: HaiShaw <hixiao@gmail.com>
Date: Fri, 27 Mar 2026 01:20:56 -0700
Subject: [PATCH 3/3] Address review comments: move envs import to module level

---
 python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
index cdb142e765c9..5b3f8515a484 100644
--- a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
+++ b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
@@ -8,6 +8,7 @@
 
 from sglang.srt.configs.model_config import get_nsa_index_head_dim, is_deepseek_nsa
 from sglang.srt.distributed.parallel_state import get_world_group
+from sglang.srt.environ import envs
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.mem_cache.allocator import (
     PagedTokenToKVPoolAllocator,
@@ -388,7 +389,6 @@ def _init_pools(self: ModelRunner):
 
                 # subscribe memory for pre-allocated requests
                 # if max_num_reqs <= 32, we pre-allocate 2x requests
-                from sglang.srt.environ import envs
 
                 pre_alloc_size = envs.SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS.get()
                 pre_alloc_size = (