Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/sglang/srt/environ.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,10 @@ class Envs:
SGLANG_DISAGGREGATION_WAITING_TIMEOUT = EnvInt(300)
SGLANG_DISAGGREGATION_NIXL_BACKEND = EnvStr("UCX")
SGLANG_DISAGGREGATION_ALL_CP_RANKS_TRANSFER = EnvBool(False)
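# Number of extra req_to_token_pool slots to pre-allocate on decode workers.
# Only takes effect when max_num_reqs > 32; when max_num_reqs <= 32 the pool
# always pre-allocates 2 * max_num_reqs slots and this value is ignored.
# A larger pool lets more KV cache transfers overlap with decode execution
# without raising max_running_requests.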
SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS = EnvInt(0)

# Scheduler: others:
SGLANG_EMPTY_CACHE_INTERVAL = EnvFloat(-1) # in seconds. Set if you observe high memory accumulation over a long serving period.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from sglang.srt.configs.model_config import get_nsa_index_head_dim, is_deepseek_nsa
from sglang.srt.distributed.parallel_state import get_world_group
from sglang.srt.environ import envs
from sglang.srt.layers.dp_attention import get_attention_tp_size
from sglang.srt.mem_cache.allocator import (
PagedTokenToKVPoolAllocator,
Expand Down Expand Up @@ -392,7 +393,11 @@ def _init_pools(self: ModelRunner):

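# Reserve memory for pre-allocated requests.
# If max_num_reqs <= 32, pre-allocate 2x max_num_reqs requests; otherwise
# pre-allocate SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS requests (default 0).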
pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0

pre_alloc_size = envs.SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS.get()
pre_alloc_size = (
max_num_reqs * 2 if max_num_reqs <= 32 else pre_alloc_size
)
if config := self.mambaish_config:
self.req_to_token_pool = HybridMambaDecodeReqToTokenPool(
size=max_num_reqs,
Expand Down
Loading