From d9a040a9f5720e85e6528ed3cd709599b1cfe3ca Mon Sep 17 00:00:00 2001
From: Rob Tand <robert.tand@icloud.com>
Date: Fri, 20 Mar 2026 12:19:37 -0400
Subject: [PATCH] fix: Exclude SM12x (desktop Blackwell) from
 CUDA_PTX_FP4FP6_CVT_ENABLED
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SM12x GPUs (RTX 5090/5080/PRO 6000, DGX Spark GB10) have mma.e2m1
tensor cores but lack the cvt.rn.satfinite.e2m1x2.f32 PTX instruction
for native FP4/FP6 conversion — this instruction is SM100-family only.

When SM120A/F or SM121A/F is included in the CUDA_PTX_FP4FP6_CVT_ENABLED
guard, CUTLASS emits this instruction even though SM12x does not support
it, which produces NaN output during NVFP4 inference.

This change removes all SM12x variants from the guard, causing SM12x to
fall through to the existing software E2M1 conversion path.

Tested on DGX Spark (SM121) running Nemotron-3-Super-120B and
Qwen3.5-122B NVFP4 models via vLLM + FlashInfer. Without this fix,
all NVFP4 inference on SM12x produces NaN output.

Signed-off-by: Rob Tand <robert.tand@icloud.com>
---
 include/cutlass/float_subbyte.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/cutlass/float_subbyte.h b/include/cutlass/float_subbyte.h
index 56d512980f..950c05aa44 100644
--- a/include/cutlass/float_subbyte.h
+++ b/include/cutlass/float_subbyte.h
@@ -44,15 +44,19 @@
 #define CUDA_FP4_ENABLED 1
 #endif
 
+// Note: SM12x (desktop Blackwell: RTX 5090/5080/PRO 6000, DGX Spark GB10) is
+// intentionally excluded below. SM12x GPUs have mma.e2m1 tensor cores but lack
+// the cvt.rn.satfinite.e2m1x2.f32 PTX instruction required for native FP4/FP6
+// conversion; that instruction is SM100-family only. Including SM12x here makes
+// CUTLASS emit the unsupported instruction, producing NaN during NVFP4 inference.
+// SM12x instead falls through to the existing software E2M1 conversion path.
 #if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) ||\
-     defined(CUTLASS_ARCH_MMA_SM103A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM110A_ENABLED) ||\
-     defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM121A_ENABLED))
+     defined(CUTLASS_ARCH_MMA_SM103A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM110A_ENABLED))
 #  define CUDA_PTX_FP4FP6_CVT_ENABLED 1
 #endif
 
 #if (defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED) ||\
-     defined(CUTLASS_ARCH_MMA_SM103F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM110F_ENABLED) ||\
-     defined(CUTLASS_ARCH_MMA_SM120F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM121F_ENABLED))
+     defined(CUTLASS_ARCH_MMA_SM103F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM110F_ENABLED))
 #  define CUDA_PTX_FP4FP6_CVT_ENABLED 1
 #endif