From 562440537a90ad8ddc69cbc20842b7a7bfd345c1 Mon Sep 17 00:00:00 2001 From: Tony Liu Date: Sun, 26 Apr 2026 23:10:06 +0800 Subject: [PATCH 1/3] [Kernel] Marlin MoE: include SM 12.x in default arch list On SM 12.x (RTX 50-series, GB10/DGX Spark), native cubins for the Marlin and Marlin-MoE kernels are currently absent from the compiled `_C.so` / `_moe_C.so`. The driver instead JIT-compiles the `8.0+PTX` fallback for SM 12.x at first use, but the resulting cubin produces silently wrong outputs on Marlin-MoE (observed: V4-Flash MoE forward emits gibberish tokens on a GB10 box, while the same model on Hopper emits coherent text). Note that PTX-JIT correctness is not guaranteed across major arch jumps; this is the expected failure mode of relying on `8.0+PTX` for sm_120/sm_121. `MARLIN_ARCHS`, `MARLIN_BF16_ARCHS`, and `MARLIN_MOE_ARCHS` in CMakeLists.txt do not list `12.0;12.1`, so the build omits native sm_120/sm_121 ELF entries from the kernel object. The neighbouring `MARLIN_FP8_ARCHS` and `MARLIN_MOE_FP8_ARCHS` already include `8.9;12.0;12.1`, so the precedent for SM 12.x in this file is set; this change extends the same pattern to the BF16/FP16 paths. Add `12.0;12.1` to the three arch lists. After rebuild on a GB10: `cuobjdump --list-elf _moe_C.abi3.so | grep sm_121` returns 22 native sm_121 ELF entries (was 0), and V4-Flash MoE forward output becomes coherent (verified haiku generation, 6.28 t/s steady on dual DGX Spark TP=2, max_tokens=80, single request). Refs #40860 (V4 rebase touches the build matrix, no overlap with this arch-list change) Signed-off-by: Tony Liu --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb8a1d7e1e14..8ede29eb5fcf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -359,11 +359,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # are not supported by Machete yet. 
# marlin arches for fp16 output - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}") # marlin has limited support for turing cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}") # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX) - cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0;12.1" "${CUDA_ARCHS}") # marlin arches for fp8 input # - sm80 doesn't support fp8 computation # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction @@ -1081,7 +1081,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # moe marlin arches # note that we always set `use_atomic_add=False` for moe marlin now, # so we don't need 9.0 for bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}") # moe marlin has limited support for turing cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}") # moe marlin arches for fp8 input From 19f86242da9ef5d12409a3adc39446b5a4f31c25 Mon Sep 17 00:00:00 2001 From: Tony Liu Date: Mon, 27 Apr 2026 14:31:19 +0800 Subject: [PATCH 2/3] Use 12.0f family flag instead of explicit 12.0;12.1 Per @Harry-Chen review: family-conditional 12.0f produces a single cubin covering the entire SM12x family (SM120, SM121, future) instead of two separate cubins, reducing binary size. Aligns with existing convention in this file (SCALED_MM_ARCHS, FP4_ARCHS, MLA_ARCHS, CUTLASS_MOE_DATA_ARCHS all use Xf family flags). 
Signed-off-by: Tony Liu --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ede29eb5fcf..52aee59c01ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -359,11 +359,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # are not supported by Machete yet. # marlin arches for fp16 output - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}") # marlin has limited support for turing cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}") # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX) - cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0;12.1" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0f" "${CUDA_ARCHS}") # marlin arches for fp8 input # - sm80 doesn't support fp8 computation # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction @@ -1081,7 +1081,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # moe marlin arches # note that we always set `use_atomic_add=False` for moe marlin now, # so we don't need 9.0 for bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}") # moe marlin has limited support for turing cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}") # moe marlin arches for fp8 input From 06b90bd4429f28254a540fed699b144b2b5b33b3 Mon Sep 17 00:00:00 2001 From: Tony Liu Date: Mon, 27 Apr 2026 15:41:38 +0800 Subject: [PATCH 3/3] Gate 12.0f family flag behind CUDA >= 13.0 Per @Harry-Chen review: the family specifier 12.0f was added in CUDA 12.9, but vLLM still supports CUDA 12.8 builds. Without a gate, builds on 12.8 fail at compile time. The gate is placed at CUDA >= 13.0, mirroring the existing pattern for MLA_ARCHS at L499-L503 of CMakeLists.txt. 
On CUDA < 13.0 the fallback uses 12.0a;12.1a (architecture-specific cubins), which CUDA 12.x toolchains with SM 12.x support accept without the family specifier. On CUDA >= 13.0 the lists use 12.0f (a single SM12x family cubin) for smaller binary size. The same handling is also applied to MARLIN_FP8_ARCHS (previously plain 12.0;12.1 with no family-flag option) for consistency, per Harry's suggestion; note that on CUDA < 13.0 this changes its SM 12.x entries from 12.0;12.1 to 12.0a;12.1a. Signed-off-by: Tony Liu --- CMakeLists.txt | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 52aee59c01ec..7e2df042a441 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -359,16 +359,30 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # are not supported by Machete yet. # marlin arches for fp16 output - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}") + # Family-conditional 12.0f (one cubin for SM12x family) requires CUDA >= 13.0; + # fall back to architecture-specific 12.0a;12.1a on CUDA < 13.0 (e.g. 12.8). + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0a;12.1a" "${CUDA_ARCHS}") + endif() # marlin has limited support for turing cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}") # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX) - cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0f" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0a;12.1a" "${CUDA_ARCHS}") + endif() # marlin arches for fp8 input # - sm80 doesn't support fp8 computation # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. 
RTX 50x0) - cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0;12.1" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0a;12.1a" "${CUDA_ARCHS}") + endif() # marlin arches for other files cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}") @@ -1081,7 +1095,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # moe marlin arches # note that we always set `use_atomic_add=False` for moe marlin now, # so we don't need 9.0 for bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0a;12.1a" "${CUDA_ARCHS}") + endif() # moe marlin has limited support for turing cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}") # moe marlin arches for fp8 input