From 6a8068e47db591ce444803dfe3c2b9e4952a8795 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 2 May 2026 18:09:19 -0700 Subject: [PATCH 1/2] add Co-authored-by: Copilot --- vllm/envs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 8378c9762ae7..6dc916d65094 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -245,7 +245,7 @@ VLLM_DEBUG_WORKSPACE: bool = False VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256 - VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD: int = 4096 + VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD: int = 1024 VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" VLLM_USE_V2_MODEL_RUNNER: bool = False VLLM_LOG_MODEL_INSPECTION: bool = False @@ -1689,7 +1689,7 @@ def _get_or_set_default() -> str: # the multi-stream path entirely. Empirical crossover on B300 (148 SMs) # is ~4096; B200 (132 SMs) is expected ~3072. "VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD": lambda: int( - os.getenv("VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD", "4096") + os.getenv("VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD", "1024") ), # Format for saving torch.compile cache artifacts # - "binary": saves as binary file From 0e1a35bc73b2e29d890d49da3d1cce040fc88e93 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 2 May 2026 18:25:14 -0700 Subject: [PATCH 2/2] comment Co-authored-by: Copilot --- vllm/envs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 6dc916d65094..0955894754fb 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1686,8 +1686,8 @@ def _get_or_set_default() -> str: # tokens the FP8 main GEMM has idle SMs to share with the bf16 aux GEMMs # and overlap is a 5-45% win; above it the FP8 GEMM saturates the device # and the cross-stream sync becomes pure overhead. Set to 0 to disable - # the multi-stream path entirely. Empirical crossover on B300 (148 SMs) - # is ~4096; B200 (132 SMs) is expected ~3072. 
+ # the multi-stream path entirely. See PR #41526 for the empirical result + # for the default value of 1024 tokens. "VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD": lambda: int( os.getenv("VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD", "1024") ),