diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index 28b9ee4f4b86..adc21301ba15 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -243,6 +243,10 @@ class Envs: SGLANG_DISAGGREGATION_WAITING_TIMEOUT = EnvInt(300) SGLANG_DISAGGREGATION_NIXL_BACKEND = EnvStr("UCX") SGLANG_DISAGGREGATION_ALL_CP_RANKS_TRANSFER = EnvBool(False) + # Extra slots in req_to_token_pool for decode workers (only effective when + # max_num_reqs > 32). Increases pool capacity so more KV cache transfers + # can overlap with decode execution without raising max_running_requests. + SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS = EnvInt(0) # Scheduler: others: SGLANG_EMPTY_CACHE_INTERVAL = EnvFloat(-1) # in seconds. Set if you observe high memory accumulation over a long serving period. diff --git a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py index 810f31380b74..291d9515df17 100644 --- a/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py +++ b/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py @@ -8,6 +8,7 @@ from sglang.srt.configs.model_config import get_nsa_index_head_dim, is_deepseek_nsa from sglang.srt.distributed.parallel_state import get_world_group +from sglang.srt.environ import envs from sglang.srt.layers.dp_attention import get_attention_tp_size from sglang.srt.mem_cache.allocator import ( PagedTokenToKVPoolAllocator, @@ -392,7 +393,11 @@ def _init_pools(self: ModelRunner): # subscribe memory for pre-allocated requests # if max_num_reqs <= 32, we pre-allocate 2x requests - pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0 + + pre_alloc_size = envs.SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS.get() + pre_alloc_size = ( + max_num_reqs * 2 if max_num_reqs <= 32 else pre_alloc_size + ) if config := self.mambaish_config: self.req_to_token_pool = HybridMambaDecodeReqToTokenPool( size=max_num_reqs,