Skip to content

Commit f8ca1fb

Browse files
committed
minor
Signed-off-by: Frida Hou <[email protected]>
1 parent 1648939 commit f8ca1fb

File tree

2 files changed

+3
-6
lines changed

2 files changed

+3
-6
lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/torch_quant.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,14 +223,14 @@ def custom_quant_linear(
223223
"NVFP4 needs weight_scale[0] (per-block vector) and weight_scale[1] (alpha)."
224224
)
225225
cutlass_qscale = weight_scale[0]
226-
alpha_inv = weight_scale[1]
226+
alpha = weight_scale[1]
227227

228228
if cutlass_qscale.dtype != torch.uint8:
229229
raise TypeError(
230230
"NVFP4 expects CUTLASS per-block scale vector in uint8 (same as fused op)."
231231
)
232232

233-
inv_w = alpha_inv / inv_x
233+
inv_w = 1 / (inv_x * alpha)
234234
s2_x = 1.0 / inv_x
235235
s2_w = 1.0 / inv_w
236236

tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_quant.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -184,17 +184,14 @@ def test_quant_linear_nvfp4_matches_fused_op(bias):
184184
alpha=alpha_fused,
185185
)
186186

187-
# Unified op (expects modelopt-style per-block scale vector + combined alpha = s_in2*s_w2)
188-
alpha_unified = (s_in2 * s_w2).to(torch.float32)
189-
190187
out_unified = torch.ops.auto_deploy.custom_quant_linear(
191188
x,
192189
weight_fp4,
193190
bias,
194191
[s_in2], # input_scale list
195192
[
196193
weight_scale_cutlass,
197-
alpha_unified,
194+
alpha_fused,
198195
], # weight_scale list: [per-block vector, alpha]
199196
[], # input_zp
200197
[], # weight_zp

0 commit comments

Comments
 (0)