Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions include/cutlass/float_subbyte.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,19 @@
#define CUDA_FP4_ENABLED 1
#endif

// Note: SM12x (desktop Blackwell: RTX 5090/5080, DGX Spark GB10) is intentionally
// excluded from both the *A_ENABLED and *F_ENABLED guards below. SM12x GPUs have
// mma.e2m1 tensor cores but lack the cvt.rn.satfinite.e2m1x2.f32 PTX instruction
// required for native FP4/FP6 conversion — this instruction is SM100-family only.
// Including SM12x here causes CUTLASS to emit invalid PTX, producing NaN during
// NVFP4 inference. SM12x falls through to the existing software E2M1 conversion
// path instead.
#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM103A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM110A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM121A_ENABLED))
defined(CUTLASS_ARCH_MMA_SM103A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM110A_ENABLED))
# define CUDA_PTX_FP4FP6_CVT_ENABLED 1
#endif

#if (defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM103F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM110F_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM120F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM121F_ENABLED))
defined(CUTLASS_ARCH_MMA_SM103F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM110F_ENABLED))
# define CUDA_PTX_FP4FP6_CVT_ENABLED 1
#endif

Expand Down