Rename silu_and_mul_fp4_batched_quantize to silu_and_mul_nvfp4_batched_quantize

wenscarl · wenscarl · commit c7eaba634476 · 2025-09-30T02:32:47.000Z
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
@@ -194,9 +194,9 @@ void fp4_batched_quantize(Tensor self, Optional<Tensor> const& mask, Tensor glob
 #undef LAUNCH_FP4_QUANTIZE_KERNEL
 }
 
-void silu_and_mul_fp4_batched_quantize(Tensor const& self, Tensor const& mask,
-                                       Tensor const& globalScale, Tensor valueE2M1,
-                                       Tensor scaleFP8SF, int64_t sfVecSize) {
+void silu_and_mul_nvfp4_batched_quantize(Tensor const& self, Tensor const& mask,
+                                         Tensor const& globalScale, Tensor valueE2M1,
+                                         Tensor scaleFP8SF, int64_t sfVecSize) {
   // TODO(shuw): mask can be none
   CHECK_CUDA(self);
   CHECK_CONTIGUOUS(self);
@@ -225,18 +225,18 @@ void silu_and_mul_fp4_batched_quantize(Tensor const& self, Tensor const& mask,
   const thread_local int mMultiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();
   auto layout = tensorrt_llm::QuantizationSFLayout::SWIZZLED_128x4;
 
-#define LAUNCH_SILU_AND_MUL_FP4_QUANTIZE_KERNEL(T, SF_VEC_SIZE)                               \
+#define LAUNCH_SILU_AND_MUL_NVFP4_QUANTIZE_KERNEL(T, SF_VEC_SIZE)                             \
   tensorrt_llm::kernels::invokeSiluAndMulFP4Quantization<T, SF_VEC_SIZE>(                     \
       b, m, k_by_2, reinterpret_cast<T*>(self->data), static_cast<float*>(globalScale->data), \
       static_cast<int32_t*>(mask->data), reinterpret_cast<int64_t*>(valueE2M1->data),         \
       reinterpret_cast<int32_t*>(scaleFP8SF->data), layout, mMultiProcessorCount,             \
       get_stream(self->device));
 
   if (self->dtype == dl_float16) {
-    LAUNCH_SILU_AND_MUL_FP4_QUANTIZE_KERNEL(half, 16)
+    LAUNCH_SILU_AND_MUL_NVFP4_QUANTIZE_KERNEL(half, 16)
   } else if (self->dtype == dl_bfloat16) {
 #ifdef ENABLE_BF16
-    LAUNCH_SILU_AND_MUL_FP4_QUANTIZE_KERNEL(__nv_bfloat16, 16)
+    LAUNCH_SILU_AND_MUL_NVFP4_QUANTIZE_KERNEL(__nv_bfloat16, 16)
 #else
     TVM_FFI_LOG_AND_THROW(NotImplementedError)
         << "BFloat16 must be enabled to quantize an bf16 tensor to fp4.";
@@ -246,9 +246,10 @@ void silu_and_mul_fp4_batched_quantize(Tensor const& self, Tensor const& mask,
         << "fp4_quantize only supports input tensor with dtypes fp16/bf16.";
   }
 
-#undef LAUNCH_SILU_AND_MUL_FP4_QUANTIZE_KERNEL
+#undef LAUNCH_SILU_AND_MUL_NVFP4_QUANTIZE_KERNEL
 }
 
 TVM_FFI_DLL_EXPORT_TYPED_FUNC(fp4_quantize, fp4_quantize);
 TVM_FFI_DLL_EXPORT_TYPED_FUNC(fp4_batched_quantize, fp4_batched_quantize);
-TVM_FFI_DLL_EXPORT_TYPED_FUNC(silu_and_mul_fp4_batched_quantize, silu_and_mul_fp4_batched_quantize);
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(silu_and_mul_nvfp4_batched_quantize,
+                              silu_and_mul_nvfp4_batched_quantize);
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h
@@ -34,6 +34,6 @@ void fp4_quantize(Tensor self, Optional<Tensor> const& globalScale, Tensor value
 void fp4_batched_quantize(Tensor self, Optional<Tensor> const& mask, Tensor globalScale,
                           Tensor valueE2M1, Tensor scaleFP8SF, int64_t sfVecSize, bool sfUseUE8M0);
 
-void silu_and_mul_fp4_batched_quantize(Tensor const& self, Tensor const& mask,
-                                       Tensor const& globalScale, Tensor valueE2M1,
-                                       Tensor scaleFP8SF, int64_t sfVecSize);
+void silu_and_mul_nvfp4_batched_quantize(Tensor const& self, Tensor const& mask,
+                                         Tensor const& globalScale, Tensor valueE2M1,
+                                         Tensor scaleFP8SF, int64_t sfVecSize);
diff --git a/docs/api/fp4_quantization.rst b/docs/api/fp4_quantization.rst
@@ -18,7 +18,7 @@ Core Quantization Functions
     nvfp4_batched_quantize
     nvfp4_block_scale_interleave
     e2m1_and_ufp8sf_scale_to_float
-    silu_and_mul_fp4_batched_quantize
+    silu_and_mul_nvfp4_batched_quantize
 
 Matrix Shuffling Utilities
 --------------------------
diff --git a/flashinfer/__init__.py b/flashinfer/__init__.py
@@ -25,7 +25,7 @@
 from .activation import gelu_tanh_and_mul as gelu_tanh_and_mul
 from .activation import silu_and_mul as silu_and_mul
 from .activation import (
-    silu_and_mul_fp4_batched_quantize as silu_and_mul_fp4_batched_quantize,
+    silu_and_mul_nvfp4_batched_quantize as silu_and_mul_nvfp4_batched_quantize,
 )
 from .attention import BatchAttention as BatchAttention
 from .attention import (
diff --git a/flashinfer/activation.py b/flashinfer/activation.py
@@ -142,7 +142,7 @@ def silu_and_mul(
     return out
 
 
-def silu_and_mul_fp4_batched_quantize(
+def silu_and_mul_nvfp4_batched_quantize(
     a,
     mask,
     a_global_sf,
@@ -166,7 +166,7 @@ def silu_and_mul_fp4_batched_quantize(
     device_arch = f"{major * 10 + minor}"
     a_fp4, a_sf = get_fp4_quantization_module(
         device_arch
-    ).silu_and_mul_fp4_batched_quantize_sm100(
+    ).silu_and_mul_nvfp4_batched_quantize_sm100(
         a,
         mask,
         a_global_sf,
diff --git a/flashinfer/fp4_quantization.py b/flashinfer/fp4_quantization.py
@@ -375,10 +375,10 @@ def _fp4_batched_quantize_sm100(
         )
 
     @register_custom_op(
-        "flashinfer::silu_and_mul_fp4_batched_quantize_sm100",
+        "flashinfer::silu_and_mul_nvfp4_batched_quantize_sm100",
         mutates_args=("",),
     )
-    def silu_and_mul_fp4_batched_quantize_sm100(
+    def silu_and_mul_nvfp4_batched_quantize_sm100(
         input: torch.Tensor,
         mask: torch.Tensor,
         global_scale: Optional[torch.Tensor] = None,
@@ -429,7 +429,7 @@ def silu_and_mul_fp4_batched_quantize_sm100(
             dtype=torch.uint8,
             device=input.device,
         )
-        module.silu_and_mul_fp4_batched_quantize(
+        module.silu_and_mul_nvfp4_batched_quantize(
             input,
             mask,
             global_scale,
@@ -439,8 +439,8 @@ def silu_and_mul_fp4_batched_quantize_sm100(
         )
         return out_val, out_sf
 
-    @register_fake_op("flashinfer::silu_and_mul_fp4_batched_quantize_sm100")
-    def _silu_and_mul_fp4_batched_quantize_sm100(
+    @register_fake_op("flashinfer::silu_and_mul_nvfp4_batched_quantize_sm100")
+    def _silu_and_mul_nvfp4_batched_quantize_sm100(
         input: torch.Tensor,
         mask: torch.Tensor,
         global_scale: Optional[torch.Tensor] = None,
@@ -518,7 +518,7 @@ def _fake_e2m1_and_ufp8sf_scale_to_float_sm100(
         e2m1_and_ufp8sf_scale_to_float_sm100=e2m1_and_ufp8sf_scale_to_float_sm100,
         mxfp4_dequantize_host=mxfp4_dequantize_host,
         fp4_batched_quantize_sm100=fp4_batched_quantize_sm100,
-        silu_and_mul_fp4_batched_quantize_sm100=silu_and_mul_fp4_batched_quantize_sm100,
+        silu_and_mul_nvfp4_batched_quantize_sm100=silu_and_mul_nvfp4_batched_quantize_sm100,
     )
 
 
diff --git a/tests/test_fp4_quantize.py b/tests/test_fp4_quantize.py
@@ -11,7 +11,7 @@
     mxfp4_quantize,
     mxfp4_dequantize,
     nvfp4_batched_quantize,
-    silu_and_mul_fp4_batched_quantize,
+    silu_and_mul_nvfp4_batched_quantize,
 )
 from flashinfer.utils import is_sm100a_supported
 
@@ -377,13 +377,13 @@ def test_nvfp4_batched_quantize(
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_silu_and_mul_fp4_batched_quantize(
+def test_silu_and_mul_nvfp4_batched_quantize(
     dtype: torch.dtype,
     batch_shape: tuple[int, int, int],
     seed: int,
     device: str,
 ) -> None:
-    """Test silu_and_mul_fp4_batched_quantize function."""
+    """Test silu_and_mul_nvfp4_batched_quantize function."""
     if not is_sm100a_supported(torch.device(device)):
         pytest.skip("Nvfp4 Requires compute capability of 10 or above")
     torch.set_default_device(device)
@@ -399,7 +399,7 @@ def test_silu_and_mul_fp4_batched_quantize(
     tensor_amax = ref_y.abs().amax(dim=(1, 2)).to(torch.float32)
     global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
 
-    out, out_scale = silu_and_mul_fp4_batched_quantize(x, mask, global_scale)
+    out, out_scale = silu_and_mul_nvfp4_batched_quantize(x, mask, global_scale)
     ref_out, ref_out_scale = nvfp4_batched_quantize(
         ref_y,
         global_scale,

Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@`
`25`	`25`	`from .activation import gelu_tanh_and_mul as gelu_tanh_and_mul`
`26`	`26`	`from .activation import silu_and_mul as silu_and_mul`
`27`	`27`	`from .activation import (`
`28`		`- silu_and_mul_fp4_batched_quantize as silu_and_mul_fp4_batched_quantize,`
	`28`	`+ silu_and_mul_nvfp4_batched_quantize as silu_and_mul_nvfp4_batched_quantize,`
`29`	`29`	`)`
`30`	`30`	`from .attention import BatchAttention as BatchAttention`
`31`	`31`	`from .attention import (`