From 6a8068e47db591ce444803dfe3c2b9e4952a8795 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 2 May 2026 18:09:19 -0700 Subject: [PATCH 1/2] add Co-authored-by: Copilot --- vllm/envs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 8378c9762ae7..6dc916d65094 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -245,7 +245,7 @@ VLLM_DEBUG_WORKSPACE: bool = False VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256 - VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD: int = 4096 + VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD: int = 1024 VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" VLLM_USE_V2_MODEL_RUNNER: bool = False VLLM_LOG_MODEL_INSPECTION: bool = False @@ -1689,7 +1689,7 @@ def _get_or_set_default() -> str: # the multi-stream path entirely. Empirical crossover on B300 (148 SMs) # is ~4096; B200 (132 SMs) is expected ~3072. "VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD": lambda: int( - os.getenv("VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD", "4096") + os.getenv("VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD", "1024") ), # Format for saving torch.compile cache artifacts # - "binary": saves as binary file From 0e1a35bc73b2e29d890d49da3d1cce040fc88e93 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 2 May 2026 18:25:14 -0700 Subject: [PATCH 2/2] comment Co-authored-by: Copilot --- vllm/envs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 6dc916d65094..0955894754fb 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1686,8 +1686,8 @@ def _get_or_set_default() -> str: # tokens the FP8 main GEMM has idle SMs to share with the bf16 aux GEMMs # and overlap is a 5-45% win; above it the FP8 GEMM saturates the device # and the cross-stream sync becomes pure overhead. Set to 0 to disable - # the multi-stream path entirely. Empirical crossover on B300 (148 SMs) - # is ~4096; B200 (132 SMs) is expected ~3072. 
+ # the multi-stream path entirely. See PR #41526 for the empirical result + # for the default value of 1024 tokens. "VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD": lambda: int( os.getenv("VLLM_MULTI_STREAM_GEMM_TOKEN_THRESHOLD", "1024") ),