Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions csrc/flashinfer_norm_binding.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@

void rmsnorm(TensorView out, TensorView input, TensorView weight, double eps, bool enable_pdl);

void rmsnorm_quant(TensorView out, TensorView input, TensorView weight, double scale, double eps,
bool enable_pdl);
void rmsnorm_quant(TensorView out, TensorView input, TensorView weight, TensorView scale,
double eps, bool enable_pdl);

void fused_add_rmsnorm(TensorView input, TensorView residual, TensorView weight, double eps,
bool enable_pdl);

void fused_add_rmsnorm_quant(TensorView output, TensorView input, TensorView residual,
TensorView weight, double scale, double eps, bool enable_pdl);
TensorView weight, TensorView scale, double eps, bool enable_pdl);

void gemma_rmsnorm(TensorView out, TensorView input, TensorView weight, double eps,
bool enable_pdl);
Expand Down
16 changes: 10 additions & 6 deletions csrc/norm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,15 @@ void rmsnorm(TensorView output, TensorView input, TensorView weight, double eps,
}
}

void rmsnorm_quant(TensorView output, TensorView input, TensorView weight, double scale, double eps,
bool enable_pdl) {
void rmsnorm_quant(TensorView output, TensorView input, TensorView weight, TensorView scale,
double eps, bool enable_pdl) {
CHECK_LAST_DIM_CONTIGUOUS_INPUT(input);
CHECK_LAST_DIM_CONTIGUOUS_INPUT(output);
CHECK_LAST_DIM_CONTIGUOUS_INPUT(weight);
CHECK_DEVICE(input, weight);
CHECK_DEVICE(input, scale);
CHECK_DIM(1, weight); // weight: (hidden_size)
TVM_FFI_ICHECK_EQ(scale.numel(), 1);

auto input_ndim = input.ndim();
if (input_ndim == 2) {
Expand All @@ -103,7 +105,7 @@ void rmsnorm_quant(TensorView output, TensorView input, TensorView weight, doubl
cudaError_t status = norm::RMSNormQuant(
static_cast<c_type*>(input.data_ptr()), static_cast<c_type*>(weight.data_ptr()),
static_cast<o_type*>(output.data_ptr()), batch_size, hidden_size, input.stride(0),
output.stride(0), static_cast<float>(scale), eps, enable_pdl, stream);
output.stride(0), static_cast<float*>(scale.data_ptr()), eps, enable_pdl, stream);
TVM_FFI_ICHECK(status == cudaSuccess)
<< "RMSNormQuant failed with error code " << cudaGetErrorString(status);
return true;
Expand Down Expand Up @@ -145,14 +147,15 @@ void fused_add_rmsnorm(TensorView input, TensorView residual, TensorView weight,
}

void fused_add_rmsnorm_quant(TensorView output, TensorView input, TensorView residual,
TensorView weight, double scale, double eps, bool enable_pdl) {
TensorView weight, TensorView scale, double eps, bool enable_pdl) {
CHECK_LAST_DIM_CONTIGUOUS_INPUT(input);
CHECK_LAST_DIM_CONTIGUOUS_INPUT(residual);
CHECK_LAST_DIM_CONTIGUOUS_INPUT(weight);
CHECK_LAST_DIM_CONTIGUOUS_INPUT(output);
CHECK_DEVICE(input, residual);
CHECK_DEVICE(input, weight);
CHECK_DEVICE(input, output);
CHECK_DEVICE(input, scale);
CHECK_DIM(2, input); // input: (batch_size, hidden_size)
CHECK_DIM(2, residual); // residual: (batch_size, hidden_size)
CHECK_DIM(1, weight); // weight: (hidden_size)
Expand All @@ -162,6 +165,7 @@ void fused_add_rmsnorm_quant(TensorView output, TensorView input, TensorView res
TVM_FFI_ICHECK_EQ(residual.size(0), batch_size);
TVM_FFI_ICHECK_EQ(residual.size(1), hidden_size);
TVM_FFI_ICHECK_EQ(weight.size(0), hidden_size);
TVM_FFI_ICHECK_EQ(scale.numel(), 1);
ffi::CUDADeviceGuard device_guard(input.device().device_id);
const cudaStream_t stream = get_stream(input.device());

Expand All @@ -170,8 +174,8 @@ void fused_add_rmsnorm_quant(TensorView output, TensorView input, TensorView res
cudaError_t status = norm::FusedAddRMSNormQuant(
static_cast<c_type*>(input.data_ptr()), static_cast<c_type*>(residual.data_ptr()),
static_cast<c_type*>(weight.data_ptr()), static_cast<o_type*>(output.data_ptr()),
batch_size, hidden_size, input.stride(0), residual.stride(0), output.stride(0), scale,
eps, enable_pdl, stream);
batch_size, hidden_size, input.stride(0), residual.stride(0), output.stride(0),
static_cast<float*>(scale.data_ptr()), eps, enable_pdl, stream);

TVM_FFI_ICHECK(status == cudaSuccess)
<< "FusedAddRMSNormQuant failed with error code " << cudaGetErrorString(status);
Expand Down
2 changes: 2 additions & 0 deletions docs/api/norm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ Kernels for normalization layers.
:toctree: ../generated

rmsnorm
rmsnorm_quant
fused_add_rmsnorm
fused_add_rmsnorm_quant
gemma_rmsnorm
gemma_fused_add_rmsnorm
layernorm
7 changes: 5 additions & 2 deletions flashinfer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,11 @@
from .norm import gemma_rmsnorm as gemma_rmsnorm
from .norm import rmsnorm as rmsnorm

from .norm import rmsnorm_fp4quant as rmsnorm_fp4quant
from .norm import add_rmsnorm_fp4quant as add_rmsnorm_fp4quant
try:
from .norm import rmsnorm_fp4quant as rmsnorm_fp4quant
from .norm import add_rmsnorm_fp4quant as add_rmsnorm_fp4quant
except (ImportError, AttributeError):
pass # nvidia-cutlass-dsl not installed
Comment on lines +115 to +119
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Expose quantized norm APIs at the package level.

Line 97-101 exports rmsnorm and fused_add_rmsnorm, but the new quantized variants (rmsnorm_quant, fused_add_rmsnorm_quant) from flashinfer.norm are still missing at the top level. Consider exporting them here so flashinfer.rmsnorm_quant works consistently.

✅ Suggested export additions
 from .norm import fused_add_rmsnorm as fused_add_rmsnorm
+from .norm import fused_add_rmsnorm_quant as fused_add_rmsnorm_quant
 from .norm import layernorm as layernorm
 from .norm import gemma_fused_add_rmsnorm as gemma_fused_add_rmsnorm
 from .norm import gemma_rmsnorm as gemma_rmsnorm
 from .norm import rmsnorm as rmsnorm
+from .norm import rmsnorm_quant as rmsnorm_quant

As per coding guidelines: Export new operations in flashinfer/__init__.py to make them available at package level.

🤖 Prompt for AI Agents
In `@flashinfer/__init__.py` around lines 103 - 107, The package-level exports for
the quantized norm variants are missing: import rmsnorm_fp4quant and
add_rmsnorm_fp4quant from flashinfer.norm (already attempted in the try block)
and then assign them to the public names used elsewhere (e.g., expose
rmsnorm_fp4quant as rmsnorm_quant and add_rmsnorm_fp4quant as
fused_add_rmsnorm_quant) so flashinfer.rmsnorm_quant and
flashinfer.fused_add_rmsnorm_quant resolve; update the try block in
flashinfer.__init__.py to perform these assignments (keep the existing
ImportError/AttributeError handling).

from .page import append_paged_kv_cache as append_paged_kv_cache
from .page import append_paged_mla_kv_cache as append_paged_mla_kv_cache
from .page import get_batch_indices_positions as get_batch_indices_positions
Expand Down
31 changes: 31 additions & 0 deletions flashinfer/cute_dsl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,24 @@
AddRMSNormFP4QuantKernel,
)

# Backwards-compatible re-exports from flashinfer.norm.kernels submodule
from ..norm.kernels import (
# Kernel classes
RMSNormKernel,
QKRMSNormKernel,
RMSNormQuantKernel,
FusedAddRMSNormKernel,
FusedAddRMSNormQuantKernel,
LayerNormKernel,
# Python API functions
rmsnorm_cute,
qk_rmsnorm_cute,
rmsnorm_quant_cute,
fused_add_rmsnorm_cute,
fused_add_rmsnorm_quant_cute,
layernorm_cute,
)

__all__ = [
# Utils (always available)
"is_cute_dsl_available",
Expand All @@ -79,4 +97,17 @@
# Add + RMSNorm + FP4 Quantization
"add_rmsnorm_fp4quant",
"AddRMSNormFP4QuantKernel",
# Norm kernels (CuTe DSL) - backwards-compatible re-exports
"RMSNormKernel",
"QKRMSNormKernel",
"RMSNormQuantKernel",
"FusedAddRMSNormKernel",
"FusedAddRMSNormQuantKernel",
"LayerNormKernel",
"rmsnorm_cute",
"qk_rmsnorm_cute",
"rmsnorm_quant_cute",
"fused_add_rmsnorm_cute",
"fused_add_rmsnorm_quant_cute",
"layernorm_cute",
]
4 changes: 2 additions & 2 deletions flashinfer/cute_dsl/add_rmsnorm_fp4quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -1012,8 +1012,8 @@ def tensor_api(
s_tensor,
s_unswizzled.contiguous(),
global_scale,
Int32(M),
Float32(eps),
M,
eps,
)

return tensor_api
Expand Down
4 changes: 2 additions & 2 deletions flashinfer/cute_dsl/rmsnorm_fp4quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,8 +750,8 @@ def tensor_api(
y_uint8,
s_tensor,
global_scale,
Int32(M),
Float32(eps),
M,
eps,
)

return tensor_api
Expand Down
Loading
Loading