vllm-project · StrongerXi · Oct 1, 2025 · StrongerXi · Oct 2, 2025
@@ -315,6 +315,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/permute_cols.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_blockwise_moe_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"

diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_entry.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_entry.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+// Forward declaration of cutlass_fp4_group_mm_sm100a. It is only visible when
+// ENABLE_NVFP4_SM100 is defined to a non-zero value. The accompanying hunk for
+// csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu renames the function
+// previously called cutlass_fp4_group_mm to this name, so the definition is
+// expected to live in that file.
+// NOTE(review): this parameter list has to stay in sync with that definition.
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
+void cutlass_fp4_group_mm_sm100a(
+    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets);
+#endif
+
+// Dispatcher for cutlass_fp4_group_mm.
+//
+// When the build defines ENABLE_NVFP4_SM100 to a non-zero value, every
+// argument is forwarded unchanged, in declaration order, to
+// cutlass_fp4_group_mm_sm100a. In any other build no kernel has been compiled
+// in, so the call raises a not-implemented error through
+// TORCH_CHECK_NOT_IMPLEMENTED.
+//
+// NOTE(review): no argument validation happens in this function; presumably
+// the sm100a implementation performs it -- confirm against the kernel file.
+void cutlass_fp4_group_mm(
+    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
+  // The callee returns void; `return` only ends the function here so the
+  // fallback error below is never reached in an SM100-enabled build.
+  // The second block-scale argument must be `b_blockscales` (plural), matching
+  // the parameter name above.
+  return cutlass_fp4_group_mm_sm100a(output, a, b, a_blockscale, b_blockscales,
+                                     alphas, problem_sizes, expert_offsets,
+                                     sf_offsets);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false,
+                              "No compiled nvfp4 mm kernel, vLLM should "
+                              "be compiled using CUDA 12.8 and target "
+                              "compute capability 100 or above.");
+}
diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
@@ -367,7 +367,7 @@ constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
   CHECK_CONTIGUOUS(x, m);     \
   CHECK_TYPE(x, st, m)
 
-void cutlass_fp4_group_mm(
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) 
   set(SRCS 
     "csrc/quantization/fp4/nvfp4_quant_kernels.cu" 
     "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" 
     "csrc/quantization/fp4/nvfp4_experts_quant.cu" 
     "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" 
     "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu") 
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) 
   set(SRCS 
     "csrc/quantization/fp4/nvfp4_quant_kernels.cu" 
     "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" 
     "csrc/quantization/fp4/nvfp4_experts_quant.cu" 
     "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" 
     "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu") 
+void cutlass_fp4_group_mm_sm100a(
     torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
     const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
     const torch::Tensor& alphas, const torch::Tensor& problem_sizes,