From bb88154e99f29a6fc4a563be0bb938ae86f9fdf4 Mon Sep 17 00:00:00 2001 From: Majid Taheri Andani Date: Mon, 1 Jun 2026 16:54:36 +0000 Subject: [PATCH] [Perf] Add tuned selective_state_update configs for H200 and RTX PRO 6000 Blackwell The merged set from #43083 ships configs only for B200, GB200, and H100_80GB_HBM3. On H200 and RTX PRO 6000 Blackwell Server Edition the loader falls back to the kernel's built-in defaults, leaving measurable performance on the table. This adds 4 JSON config files (no code change) generated by running the existing benchmarks/kernels/benchmark_selective_state_update.py script with --save-configs; the filenames match the loader filename pattern in vllm/model_executor/layers/mamba/ops/mamba_ssm.py. Devices added (headdim=64, dstate=128, the same shape that Nemotron-H/Nano/Super use): - NVIDIA_H200 (float16, float32) - NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition (float16, float32) Validation: - H200 (p5en.48xlarge): +2.6% end-to-end serving throughput on Nemotron-Nano-9B-v2 at TP=1; kernel-level speedups of 1.2-1.5x (fp32) and ~2x (fp16) vs. the default fallback. - RTX PRO 6000 Blackwell (g7e): end-to-end neutral on the current default fp32 path (Triton's heuristic already happened to pick a near-optimal config); the fp16 kernel-level benchmark shows a ~2x speedup. The JSON is shipped anyway to lock in that choice across Triton releases. 
Signed-off-by: Majid Taheri Andani --- ..._name=NVIDIA_H200,cache_dtype=float16.json | 87 +++++++++++++++++++ ..._name=NVIDIA_H200,cache_dtype=float32.json | 87 +++++++++++++++++++ ...ll_Server_Edition,cache_dtype=float16.json | 87 +++++++++++++++++++ ...ll_Server_Edition,cache_dtype=float32.json | 87 +++++++++++++++++++ 4 files changed, 348 insertions(+) create mode 100644 vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_H200,cache_dtype=float16.json create mode 100644 vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_H200,cache_dtype=float32.json create mode 100644 vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,cache_dtype=float16.json create mode 100644 vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,cache_dtype=float32.json diff --git a/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_H200,cache_dtype=float16.json b/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_H200,cache_dtype=float16.json new file mode 100644 index 000000000000..fdf38cdf042c --- /dev/null +++ b/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_H200,cache_dtype=float16.json @@ -0,0 +1,87 @@ +{ + "triton_version": "3.6.0", + "8": { + "BLOCK_SIZE_M": 4, + "num_warps": 2 + }, + "16": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "32": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_M": 8, + "num_warps": 2 + }, + "256": { + "BLOCK_SIZE_M": 8, + "num_warps": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "num_warps": 1 + }, + "1024": { + 
"BLOCK_SIZE_M": 16, + "num_warps": 1 + }, + "2048": { + "BLOCK_SIZE_M": 8, + "num_warps": 2 + }, + "4096": { + "BLOCK_SIZE_M": 16, + "num_warps": 2 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "num_warps": 2 + }, + "12288": { + "BLOCK_SIZE_M": 32, + "num_warps": 4 + }, + "16384": { + "BLOCK_SIZE_M": 16, + "num_warps": 2 + }, + "24576": { + "BLOCK_SIZE_M": 32, + "num_warps": 4 + }, + "32768": { + "BLOCK_SIZE_M": 32, + "num_warps": 2 + }, + "49152": { + "BLOCK_SIZE_M": 16, + "num_warps": 2 + }, + "65536": { + "BLOCK_SIZE_M": 16, + "num_warps": 2 + }, + "98304": { + "BLOCK_SIZE_M": 16, + "num_warps": 2 + }, + "131072": { + "BLOCK_SIZE_M": 16, + "num_warps": 2 + }, + "196608": { + "BLOCK_SIZE_M": 16, + "num_warps": 2 + }, + "262144": { + "BLOCK_SIZE_M": 16, + "num_warps": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_H200,cache_dtype=float32.json b/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_H200,cache_dtype=float32.json new file mode 100644 index 000000000000..82bdff701342 --- /dev/null +++ b/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_H200,cache_dtype=float32.json @@ -0,0 +1,87 @@ +{ + "triton_version": "3.6.0", + "8": { + "BLOCK_SIZE_M": 8, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "32": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "64": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "128": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "256": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "512": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "1024": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "2048": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "4096": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "8192": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "12288": { + "BLOCK_SIZE_M": 8, + 
"num_warps": 1 + }, + "16384": { + "BLOCK_SIZE_M": 8, + "num_warps": 1 + }, + "24576": { + "BLOCK_SIZE_M": 8, + "num_warps": 1 + }, + "32768": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "49152": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "65536": { + "BLOCK_SIZE_M": 8, + "num_warps": 2 + }, + "98304": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "131072": { + "BLOCK_SIZE_M": 32, + "num_warps": 4 + }, + "196608": { + "BLOCK_SIZE_M": 16, + "num_warps": 1 + }, + "262144": { + "BLOCK_SIZE_M": 16, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,cache_dtype=float16.json b/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,cache_dtype=float16.json new file mode 100644 index 000000000000..6be92a4bc28c --- /dev/null +++ b/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,cache_dtype=float16.json @@ -0,0 +1,87 @@ +{ + "triton_version": "3.6.0", + "8": { + "BLOCK_SIZE_M": 4, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "32": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "64": { + "BLOCK_SIZE_M": 8, + "num_warps": 8 + }, + "128": { + "BLOCK_SIZE_M": 16, + "num_warps": 8 + }, + "256": { + "BLOCK_SIZE_M": 16, + "num_warps": 8 + }, + "512": { + "BLOCK_SIZE_M": 16, + "num_warps": 8 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "num_warps": 8 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "num_warps": 8 + }, + "4096": { + "BLOCK_SIZE_M": 16, + "num_warps": 8 + }, + "8192": { + "BLOCK_SIZE_M": 32, + "num_warps": 1 + }, + "12288": { + "BLOCK_SIZE_M": 16, + "num_warps": 1 + }, + "16384": { + "BLOCK_SIZE_M": 32, + "num_warps": 4 + }, + "24576": { + "BLOCK_SIZE_M": 32, + "num_warps": 4 + }, + "32768": { + 
"BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "49152": { + "BLOCK_SIZE_M": 32, + "num_warps": 2 + }, + "65536": { + "BLOCK_SIZE_M": 32, + "num_warps": 1 + }, + "98304": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "131072": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "196608": { + "BLOCK_SIZE_M": 32, + "num_warps": 1 + }, + "262144": { + "BLOCK_SIZE_M": 16, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,cache_dtype=float32.json b/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,cache_dtype=float32.json new file mode 100644 index 000000000000..7b55fab1add0 --- /dev/null +++ b/vllm/model_executor/layers/mamba/ops/configs/selective_state_update/headdim=64,dstate=128,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,cache_dtype=float32.json @@ -0,0 +1,87 @@ +{ + "triton_version": "3.6.0", + "8": { + "BLOCK_SIZE_M": 8, + "num_warps": 8 + }, + "16": { + "BLOCK_SIZE_M": 8, + "num_warps": 8 + }, + "32": { + "BLOCK_SIZE_M": 8, + "num_warps": 8 + }, + "64": { + "BLOCK_SIZE_M": 8, + "num_warps": 8 + }, + "128": { + "BLOCK_SIZE_M": 8, + "num_warps": 8 + }, + "256": { + "BLOCK_SIZE_M": 8, + "num_warps": 8 + }, + "512": { + "BLOCK_SIZE_M": 8, + "num_warps": 8 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "num_warps": 8 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "num_warps": 8 + }, + "4096": { + "BLOCK_SIZE_M": 16, + "num_warps": 1 + }, + "8192": { + "BLOCK_SIZE_M": 4, + "num_warps": 8 + }, + "12288": { + "BLOCK_SIZE_M": 16, + "num_warps": 1 + }, + "16384": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "24576": { + "BLOCK_SIZE_M": 4, + "num_warps": 1 + }, + "32768": { + "BLOCK_SIZE_M": 4, + "num_warps": 4 + }, + "49152": { + "BLOCK_SIZE_M": 16, + "num_warps": 4 + }, + "65536": { + "BLOCK_SIZE_M": 64, + "num_warps": 8 + 
}, + "98304": { + "BLOCK_SIZE_M": 16, + "num_warps": 1 + }, + "131072": { + "BLOCK_SIZE_M": 8, + "num_warps": 1 + }, + "196608": { + "BLOCK_SIZE_M": 64, + "num_warps": 8 + }, + "262144": { + "BLOCK_SIZE_M": 64, + "num_warps": 4 + } +} \ No newline at end of file