@@ -161,7 +161,6 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
             and envs.FD_USE_MACHETE == "1"
             and layer.weight_shape[1]
             and layer.weight_shape[1] % 128 == 0
-            and not layer.add_bias
         ):
             return MacheteWeightOnlyLinearMethod(self)
         return GPUWeightOnlyLinearMethod(self)
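
With the `add_bias` guard removed, Machete is now eligible for layers that carry a bias; the bias is applied inside `MacheteWeightOnlyLinearMethod.apply()` (last hunk below). A minimal sketch of the resulting selection rule, written as a hypothetical standalone helper with names taken from this diff:

```python
# Sketch only: mirrors the condition in get_quant_method after this change.
# Machete is chosen when the env switch is on and weight_shape[1] is a
# non-zero multiple of 128; having a bias no longer disqualifies the layer.
def machete_eligible(layer, fd_use_machete: str) -> bool:
    return (
        fd_use_machete == "1"
        and bool(layer.weight_shape[1])
        and layer.weight_shape[1] % 128 == 0
    )
```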
@@ -244,7 +243,8 @@ def create_weights(self, layer, **extra_weight_attrs):
             )
         else:
             if isinstance(self, MacheteWeightOnlyLinearMethod):
-                weight_scale_shape = [1, layer.weight_shape[1]]
+                # Using group scale for machete, group size is 128
+                weight_scale_shape = [(layer.weight_shape[0] + 127) // 128, layer.weight_shape[1]]
                 if self.quant_config.name() == "wint4":
                     layer.weight_shape[0] //= 8
             else:
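
To make the new scale layout concrete, here is the arithmetic with assumed example shapes (the [4096, 12288] weight is made up for illustration):

```python
# Per-group scale shape introduced above, assuming a hypothetical weight
# of shape [K, N] = [4096, 12288] and group size 128.
K, N = 4096, 12288
group_size = 128
weight_scale_shape = [(K + group_size - 1) // group_size, N]  # -> [32, 12288]
# One scale row per group of 128 rows along dim 0, replacing the single
# row [1, N] used before this change.
# For wint4, eight 4-bit values are packed per stored element along dim 0,
# so the stored weight's first dim becomes K // 8 = 512.
print(weight_scale_shape)
```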
@@ -299,10 +299,12 @@ def process_weights_after_loading(self, layer) -> None:
                 machete_quantize_and_pack,
             )
 
+            # Using group scale for machete, group size is 128
             quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
                 w=layer.weight,
                 atype=layer._dtype,
                 quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+                group_size=128,
             )
         else:
             quanted_weight_tensor, weight_scale_tensor = weight_quantize(
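
The `group_size=128` argument matches the scale shape created in `create_weights`. As a rough illustration of what group-wise scales of this shape look like (not the actual `machete_quantize_and_pack` kernel, which also packs the weight into Machete's prepacked layout):

```python
import paddle


def groupwise_scales(w: paddle.Tensor, group_size: int = 128) -> paddle.Tensor:
    """Illustrative max-abs scales, one row per group of 128 weight rows.

    Returns shape [ceil(K / group_size), N], matching weight_scale above.
    The divisor 7.0 assumes a symmetric int4-style range for the sketch.
    """
    k, n = w.shape
    num_groups = (k + group_size - 1) // group_size
    scales = []
    for g in range(num_groups):
        block = w[g * group_size : min((g + 1) * group_size, k)]
        scales.append(block.abs().max(axis=0, keepdim=True) / 7.0)
    return paddle.concat(scales, axis=0)
```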
@@ -404,23 +406,27 @@ def process_loaded_weights(self, layer, weight) -> None:
             machete_quantize_and_pack,
         )
 
+        # Using group scale for machete, group size is 128
         quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
             w=weight,
             atype=layer._dtype,
             quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
         layer.weight.set_value(quanted_weight_tensor)
         layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))
 
     def apply(self, layer, x):
-        assert layer.bias is None, "Machete weight only linear method does not support bias."
         from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm
 
+        # Using group scale for machete, group size is 128
         linear_out = machete_wint_mm(
             x,
             w_prepack=layer.weight,
             w_g_s=layer.weight_scale,
             weight_dtype="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
-
+        if layer.with_bias:
+            linear_out = paddle.add(linear_out, layer.bias)
         return linear_out
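
For intuition, a dequantize-then-matmul reference of what the grouped-scale GEMM plus the new bias epilogue computes (assumed semantics for illustration; the real `machete_wint_mm` consumes prepacked weights and fuses the dequantization, and zero-point handling is omitted here):

```python
import paddle


def reference_wint_mm(x, w_int, w_scale, group_size=128, bias=None):
    """w_int: [K, N] integer weights; w_scale: [ceil(K / group_size), N]."""
    k = w_int.shape[0]
    # Expand each scale row over its group of input rows, then trim to K.
    scale_full = paddle.repeat_interleave(w_scale, group_size, axis=0)[:k]
    out = paddle.matmul(x, w_int.astype(x.dtype) * scale_full)
    if bias is not None:
        out = paddle.add(out, bias)  # same bias epilogue as apply() above
    return out
```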