diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py
index 88a87868c08..070d0fbf410 100644
--- a/fastdeploy/model_executor/layers/quantization/weight_only.py
+++ b/fastdeploy/model_executor/layers/quantization/weight_only.py
@@ -161,7 +161,6 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
                 and envs.FD_USE_MACHETE == "1"
                 and layer.weight_shape[1]
                 and layer.weight_shape[1] % 128 == 0
-                and not layer.add_bias
             ):
                 return MacheteWeightOnlyLinearMethod(self)
             return GPUWeightOnlyLinearMethod(self)
@@ -244,7 +243,8 @@ def create_weights(self, layer, **extra_weight_attrs):
             )
         else:
             if isinstance(self, MacheteWeightOnlyLinearMethod):
-                weight_scale_shape = [1, layer.weight_shape[1]]
+                # Using group scale for machete, group size is 128
+                weight_scale_shape = [(layer.weight_shape[0] + 127) // 128, layer.weight_shape[1]]
                 if self.quant_config.name() == "wint4":
                     layer.weight_shape[0] //= 8
                 else:
@@ -299,10 +299,12 @@ def process_weights_after_loading(self, layer) -> None:
                 machete_quantize_and_pack,
             )

+            # Using group scale for machete, group size is 128
             quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
                 w=layer.weight,
                 atype=layer._dtype,
                 quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+                group_size=128,
             )
         else:
             quanted_weight_tensor, weight_scale_tensor = weight_quantize(
@@ -404,23 +406,27 @@ def process_loaded_weights(self, layer, weight) -> None:
             machete_quantize_and_pack,
         )

+        # Using group scale for machete, group size is 128
         quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
             w=weight,
             atype=layer._dtype,
             quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
         layer.weight.set_value(quanted_weight_tensor)
         layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

     def apply(self, layer, x):
-        assert layer.bias is None, "Machete weight only linear method does not support bias."
         from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm

+        # Using group scale for machete, group size is 128
         linear_out = machete_wint_mm(
             x,
             w_prepack=layer.weight,
             w_g_s=layer.weight_scale,
             weight_dtype="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
-
+        if layer.with_bias:
+            linear_out = paddle.add(linear_out, layer.bias)
         return linear_out
diff --git a/tests/operators/test_machete_mm.py b/tests/operators/test_machete_mm.py
index fafdf717d89..76fc93edc89 100644
--- a/tests/operators/test_machete_mm.py
+++ b/tests/operators/test_machete_mm.py
@@ -135,6 +135,8 @@ def get_machete_weight_only_linear_out(self):
             weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128",  # weight_dtype
             group_size=self.machete_group_size,
         )
+        if self.bias is not None:
+            out = paddle.add(out, self.bias)
         return out.numpy()

     def test_weight_only_linear(self):
@@ -158,7 +160,7 @@ def config(self):
         self.dtype = "float16"
         self.rtol = 1e-5
         self.atol = 1e-1
-        self.bias = False
+        self.bias = True
         self.batch = 1
         self.token = 512
         self.in_features = 7168
@@ -224,6 +226,8 @@ def get_machete_weight_only_linear_out(self):
             weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128",  # weight_dtype
             group_size=self.machete_group_size,
         )
+        if self.bias is not None:
+            out = paddle.add(out, self.bias)
         return out.numpy()

     def test_weight_only_linear(self):
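
For context, a minimal sketch of how the grouped-scale Machete path touched by this patch could be exercised end to end. The keyword arguments (quant_type/weight_dtype, group_size=128, w_prepack, w_g_s) come from the diff above; the tensor shapes, the "float16" atype, and the standalone call pattern are illustrative assumptions, not code from the repository.

# Illustrative sketch only (not part of the patch): drive the grouped-scale
# Machete kernels with the keyword arguments introduced above.
# Shapes and dtype are assumptions; the inner dimension is a multiple of the
# 128-element group size, matching the check in get_quant_method.
import paddle

from fastdeploy.model_executor.layers.quantization.ops import (
    machete_quantize_and_pack,
    machete_wint_mm,
)

x = paddle.randn([512, 7168], dtype="float16")    # activations [tokens, in_features]
w = paddle.randn([7168, 8192], dtype="float16")   # assumed [in_features, out_features] layout
bias = paddle.randn([8192], dtype="float16")

# Pack the weights with one scale per 128-element group instead of a single
# per-column scale (the old weight_scale_shape = [1, out_features]).
w_prepack, w_g_s = machete_quantize_and_pack(
    w=w,
    atype="float16",          # assumption: matches layer._dtype in the patch
    quant_type="uint4b8",     # wint4; use "uint8b128" for wint8
    group_size=128,
)

out = machete_wint_mm(
    x,
    w_prepack=w_prepack,
    w_g_s=w_g_s,
    weight_dtype="uint4b8",
    group_size=128,
)
# Bias is now applied outside the kernel, mirroring the new apply() path.
out = paddle.add(out, bias)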