@@ -161,7 +161,6 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
             and envs.FD_USE_MACHETE == "1"
             and layer.weight_shape[1]
             and layer.weight_shape[1] % 128 == 0
-            and not layer.add_bias
         ):
             return MacheteWeightOnlyLinearMethod(self)
         return GPUWeightOnlyLinearMethod(self)
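
With the `add_bias` guard removed, Machete is now eligible for layers that carry a bias; the bias is applied inside `MacheteWeightOnlyLinearMethod.apply()` (last hunk below). A minimal sketch of the resulting selection rule, written as a hypothetical standalone helper with names taken from this diff:

```python
# Sketch only: mirrors the condition in get_quant_method after this change.
# Machete is chosen when the env switch is on and weight_shape[1] is a
# non-zero multiple of 128; having a bias no longer disqualifies the layer.
def machete_eligible(layer, fd_use_machete: str) -> bool:
    return (
        fd_use_machete == "1"
        and bool(layer.weight_shape[1])
        and layer.weight_shape[1] % 128 == 0
    )
```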
@@ -244,7 +243,8 @@ def create_weights(self, layer, **extra_weight_attrs):
             )
         else:
             if isinstance(self, MacheteWeightOnlyLinearMethod):
-                weight_scale_shape = [1, layer.weight_shape[1]]
+                # Using group scale for machete, group size is 128
+                weight_scale_shape = [(layer.weight_shape[0] + 127) // 128, layer.weight_shape[1]]
                 if self.quant_config.name() == "wint4":
                     layer.weight_shape[0] //= 8
             else:
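
To make the new scale layout concrete, here is the arithmetic with assumed example shapes (the [4096, 12288] weight is made up for illustration):

```python
# Per-group scale shape introduced above, assuming a hypothetical weight
# of shape [K, N] = [4096, 12288] and group size 128.
K, N = 4096, 12288
group_size = 128
weight_scale_shape = [(K + group_size - 1) // group_size, N]  # -> [32, 12288]
# One scale row per group of 128 rows along dim 0, replacing the single
# row [1, N] used before this change.
# For wint4, eight 4-bit values are packed per stored element along dim 0,
# so the stored weight's first dim becomes K // 8 = 512.
print(weight_scale_shape)
```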
@@ -299,10 +299,12 @@ def process_weights_after_loading(self, layer) -> None:
                 machete_quantize_and_pack,
             )
 
+            # Using group scale for machete, group size is 128
             quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
                 w=layer.weight,
                 atype=layer._dtype,
                 quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+                group_size=128,
             )
         else:
             quanted_weight_tensor, weight_scale_tensor = weight_quantize(
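
The `group_size=128` argument matches the scale shape created in `create_weights`. As a rough illustration of what group-wise scales of this shape look like (not the actual `machete_quantize_and_pack` kernel, which also packs the weight into Machete's prepacked layout):

```python
import paddle


def groupwise_scales(w: paddle.Tensor, group_size: int = 128) -> paddle.Tensor:
    """Illustrative max-abs scales, one row per group of 128 weight rows.

    Returns shape [ceil(K / group_size), N], matching weight_scale above.
    The divisor 7.0 assumes a symmetric int4-style range for the sketch.
    """
    k, n = w.shape
    num_groups = (k + group_size - 1) // group_size
    scales = []
    for g in range(num_groups):
        block = w[g * group_size : min((g + 1) * group_size, k)]
        scales.append(block.abs().max(axis=0, keepdim=True) / 7.0)
    return paddle.concat(scales, axis=0)
```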
@@ -404,23 +406,27 @@ def process_loaded_weights(self, layer, weight) -> None:
             machete_quantize_and_pack,
         )
 
+        # Using group scale for machete, group size is 128
         quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
             w=weight,
             atype=layer._dtype,
             quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
         layer.weight.set_value(quanted_weight_tensor)
         layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))
 
     def apply(self, layer, x):
-        assert layer.bias is None, "Machete weight only linear method does not support bias."
         from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm
 
+        # Using group scale for machete, group size is 128
         linear_out = machete_wint_mm(
             x,
             w_prepack=layer.weight,
             w_g_s=layer.weight_scale,
             weight_dtype="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
-
+        if layer.with_bias:
+            linear_out = paddle.add(linear_out, layer.bias)
         return linear_out
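
For intuition, a dequantize-then-matmul reference of what the grouped-scale GEMM plus the new bias epilogue computes (assumed semantics for illustration; the real `machete_wint_mm` consumes prepacked weights and fuses the dequantization, and zero-point handling is omitted here):

```python
import paddle


def reference_wint_mm(x, w_int, w_scale, group_size=128, bias=None):
    """w_int: [K, N] integer weights; w_scale: [ceil(K / group_size), N]."""
    k = w_int.shape[0]
    # Expand each scale row over its group of input rows, then trim to K.
    scale_full = paddle.repeat_interleave(w_scale, group_size, axis=0)[:k]
    out = paddle.matmul(x, w_int.astype(x.dtype) * scale_full)
    if bias is not None:
        out = paddle.add(out, bias)  # same bias epilogue as apply() above
    return out
```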