diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f2c43fea284..ff56232b63cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -315,6 +315,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/awq/gemm_kernels.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" + "csrc/quantization/fp4/nvfp4_blockwise_moe_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_entry.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_entry.cu new file mode 100644 index 000000000000..80512bd61b76 --- /dev/null +++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_entry.cu @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <torch/all.h>
+
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
+void cutlass_fp4_group_mm_sm100a(
+    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets);
+#endif
+
+void cutlass_fp4_group_mm(
+    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
+    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
+  return cutlass_fp4_group_mm_sm100a(output, a, b, a_blockscale, b_blockscales,
+                                     alphas, problem_sizes, expert_offsets, sf_offsets);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false,
+                              "No compiled nvfp4 mm kernel, vLLM should "
+                              "be compiled using CUDA 12.8 and target "
+                              "compute capability 100 or above.");
+}
diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
index 2c8df6144bf4..ecc0bfd014b9 100644
--- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
+++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
@@ -367,7 +367,7 @@ constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
   CHECK_CONTIGUOUS(x, m); \
   CHECK_TYPE(x, st, m)
 
-void cutlass_fp4_group_mm(
+void cutlass_fp4_group_mm_sm100a(
     torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
     const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
     const torch::Tensor& alphas, const torch::Tensor& problem_sizes,