From 092337b47ff714c82926c1b7e50ffbccfd28d3ef Mon Sep 17 00:00:00 2001 From: Lucas Kabela Date: Fri, 24 Apr 2026 16:41:52 -0700 Subject: [PATCH 1/2] Remove workaround code for fixed cublas issue Signed-off-by: Lucas Kabela --- vllm/model_executor/layers/batch_invariant.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 3831f7aa9658..d605df1f09e9 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -930,22 +930,21 @@ def enable_batch_invariant_mode(): _batch_invariant_MODE = True _batch_invariant_LIB = torch.library.Library("aten", "IMPL") - if current_platform.is_device_capability_family( - 100 - ) or current_platform.is_device_capability_family(80): - # For PyTorch 2.9, B200 uses GEMV for bs=1 - # Requires https://github.com/pytorch/pytorch/pull/166735 + if current_platform.is_device_capability_family(80): + # SM80 (Ampere) cannot rely on cuBLASLt-only determinism; install the + # triton persistent matmul overrides for mm/addmm/matmul/linear. _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA") _batch_invariant_LIB.impl("aten::addmm", addmm_batch_invariant, "CUDA") _batch_invariant_LIB.impl("aten::matmul", matmul_batch_invariant, "CUDA") _batch_invariant_LIB.impl("aten::linear", linear_batch_invariant, "CUDA") - - # Query the shared memory size and set block size - # accordingly to avoid triton OutOfResources - _fp16_block_size_n = 256 if get_max_shared_memory_bytes() > 106496 else 128 else: - # Only source of batch invariance for Hopper is split-k, can disable through - # cuBLAS workspace config + # Hopper (SM90) and Blackwell (SM100): the only source of batch + # variance is split-k, which we disable via the cuBLAS workspace + # config. The previous SM100 override that routed mm/addmm/matmul/ + # linear through the triton persistent kernel was a workaround for a + # torch 2.9 cuBLAS GEMV-at-bs=1 path (pytorch/pytorch#166735); torch + # 2.12 makes cuBLASLt batch-invariant on B200 directly, so B200 now + # uses the same path as H100. _original_cublas_workspace_cfg = os.environ.get("CUBLAS_WORKSPACE_CONFIG", None) _original_cublaslt_workspace_size = os.environ.get( "CUBLASLT_WORKSPACE_SIZE", None @@ -953,6 +952,10 @@ def enable_batch_invariant_mode(): os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1" + # Triton bmm/persistent-matmul kernels read this for the FP16 N-tile size; + # set unconditionally because bmm is overridden on all CUDA platforms. + _fp16_block_size_n = 256 if get_max_shared_memory_bytes() > 106496 else 128 + _batch_invariant_LIB.impl( "aten::_log_softmax", _log_softmax_batch_invariant, "CUDA" ) From d92565b2e28e934b60de5544d05f9ca08adbde37 Mon Sep 17 00:00:00 2001 From: Lucas Kabela Date: Mon, 27 Apr 2026 16:24:53 -0700 Subject: [PATCH 2/2] Update vllm/model_executor/layers/batch_invariant.py Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Signed-off-by: Lucas Kabela --- vllm/model_executor/layers/batch_invariant.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 49ce6c517a73..bcdd30500329 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -940,11 +940,7 @@ def enable_batch_invariant_mode(): else: # Hopper (SM90) and Blackwell (SM100): the only source of batch # variance is split-k, which we disable via the cuBLAS workspace - # config. The previous SM100 override that routed mm/addmm/matmul/ - # linear through the triton persistent kernel was a workaround for a - # torch 2.9 cuBLAS GEMV-at-bs=1 path (pytorch/pytorch#166735); torch - # 2.12 makes cuBLASLt batch-invariant on B200 directly, so B200 now - # uses the same path as H100. + # config. _original_cublas_workspace_cfg = os.environ.get("CUBLAS_WORKSPACE_CONFIG", None) _original_cublaslt_workspace_size = os.environ.get( "CUBLASLT_WORKSPACE_SIZE", None