From 562440537a90ad8ddc69cbc20842b7a7bfd345c1 Mon Sep 17 00:00:00 2001
From: Tony Liu <tonyliu0512@gmail.com>
Date: Sun, 26 Apr 2026 23:10:06 +0800
Subject: [PATCH 1/3] [Kernel] Marlin/Marlin MoE: include SM 12.x in default arch list

On SM 12.x (RTX 50-series, GB10/DGX Spark), native Marlin and Marlin-MoE
cubins are currently absent from the compiled `_C.so` / `_moe_C.so`. The
driver instead JIT-compiles the `8.0+PTX` fallback for SM 12.x at first
use, but the resulting cubin produces silently wrong outputs on Marlin-MoE
(observed: V4-Flash MoE forward emits gibberish tokens on a GB10 box,
while the same model on Hopper emits coherent text). Note that PTX-JIT
correctness is not guaranteed across major arch jumps; this is the
expected failure mode of relying on `8.0+PTX` for sm_120/sm_121.

`MARLIN_ARCHS`, `MARLIN_BF16_ARCHS`, and `MARLIN_MOE_ARCHS` in
CMakeLists.txt do not list `12.0;12.1`, so the build omits native
sm_120/sm_121 ELF entries from the kernel object. The neighbouring
`MARLIN_FP8_ARCHS` and `MARLIN_MOE_FP8_ARCHS` already include
`8.9;12.0;12.1`, so the precedent for SM 12.x in this file is set;
this change extends the same pattern to the BF16/FP16 paths.

Add `12.0;12.1` to the three arch lists. After rebuild on a GB10:
`cuobjdump --list-elf _moe_C.abi3.so | grep sm_121` returns 22 native
sm_121 ELF entries (was 0), and V4-Flash MoE forward output becomes
coherent (verified haiku generation, 6.28 t/s steady on dual DGX Spark
TP=2, max_tokens=80, single request).

Refs #40860 (V4 rebase touches the build matrix, no overlap with this
arch-list change)

Signed-off-by: Tony Liu <tonyliu0512@gmail.com>
---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb8a1d7e1e14..8ede29eb5fcf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -359,11 +359,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # are not supported by Machete yet.
 
   # marlin arches for fp16 output
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
   # marlin has limited support for turing
   cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
-  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
   # marlin arches for fp8 input
   # - sm80 doesn't support fp8 computation
   # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
@@ -1081,7 +1081,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # moe marlin arches
   # note that we always set `use_atomic_add=False` for moe marlin now,
   # so we don't need 9.0 for bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
   # moe marlin has limited support for turing
   cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # moe marlin arches for fp8 input

From 19f86242da9ef5d12409a3adc39446b5a4f31c25 Mon Sep 17 00:00:00 2001
From: Tony Liu <tonyliu0512@gmail.com>
Date: Mon, 27 Apr 2026 14:31:19 +0800
Subject: [PATCH 2/3] Use 12.0f family flag instead of explicit 12.0;12.1

Per @Harry-Chen review: family-conditional 12.0f produces a single cubin
covering the entire SM12x family (SM120, SM121, future) instead of two
separate cubins, reducing binary size. Aligns with existing convention in
this file (SCALED_MM_ARCHS, FP4_ARCHS, MLA_ARCHS, CUTLASS_MOE_DATA_ARCHS
all use Xf family flags).

Signed-off-by: Tony Liu <tonyliu0512@gmail.com>
---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ede29eb5fcf..52aee59c01ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -359,11 +359,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # are not supported by Machete yet.
 
   # marlin arches for fp16 output
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}")
   # marlin has limited support for turing
   cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
-  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0f" "${CUDA_ARCHS}")
   # marlin arches for fp8 input
   # - sm80 doesn't support fp8 computation
   # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
@@ -1081,7 +1081,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # moe marlin arches
   # note that we always set `use_atomic_add=False` for moe marlin now,
   # so we don't need 9.0 for bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}")
   # moe marlin has limited support for turing
   cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # moe marlin arches for fp8 input

From 06b90bd4429f28254a540fed699b144b2b5b33b3 Mon Sep 17 00:00:00 2001
From: Tony Liu <tonyliu0512@gmail.com>
Date: Mon, 27 Apr 2026 15:41:38 +0800
Subject: [PATCH 3/3] Gate 12.0f family flag behind CUDA >= 13.0

Per @Harry-Chen review: family specifier 12.0f was added in CUDA 12.9,
but vLLM still supports CUDA 12.8 builds. Without the gate, builds on
12.8 fail at compile time. The gate is placed at 13.0 rather than 12.9,
mirroring the existing pattern for MLA_ARCHS at L499-L503.

Pre-13.0 fallback uses 12.0a;12.1a (architecture-specific cubins), which
do not depend on the family-specifier syntax. CUDA 13.0 and later use
12.0f (single SM12x family cubin) for smaller binary size.

Also applied the same gated handling to MARLIN_FP8_ARCHS for consistency,
per Harry's suggestion. It previously listed plain 12.0;12.1 with no
family-flag option, so its pre-13.0 entries change to 12.0a;12.1a as well.

Signed-off-by: Tony Liu <tonyliu0512@gmail.com>
---
 CMakeLists.txt | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52aee59c01ec..7e2df042a441 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -359,16 +359,30 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # are not supported by Machete yet.
 
   # marlin arches for fp16 output
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}")
+  # Family-conditional 12.0f (one cubin for SM12x family) is used only on CUDA >= 13.0;
+  # fall back to architecture-specific 12.0a;12.1a on CUDA < 13.0 (e.g. 12.8).
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   # marlin has limited support for turing
   cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
-  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0f" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   # marlin arches for fp8 input
   # - sm80 doesn't support fp8 computation
   # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
   # so we only enable fp8 computation for SM89 (e.g. RTX 40x0)  and 12.0 (e.g. RTX 50x0)
-  cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0;12.1" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   # marlin arches for other files
   cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
 
@@ -1081,7 +1095,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # moe marlin arches
   # note that we always set `use_atomic_add=False` for moe marlin now,
   # so we don't need 9.0 for bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   # moe marlin has limited support for turing
   cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # moe marlin arches for fp8 input