huggingface · Cyrilvallez · Apr 22, 2025 · Apr 22, 2025 · Apr 22, 2025 · Apr 22, 2025
diff --git a/src/transformers/quantizers/quantizer_finegrained_fp8.py b/src/transformers/quantizers/quantizer_finegrained_fp8.py
@@ -91,11 +91,9 @@ def create_quantized_param(
         """
         Quantizes weights to FP8 format using Block-wise quantization
         """
-        from accelerate.utils import set_module_tensor_to_device
+        from ..modeling_utils import _load_parameter_into_model
 
-        set_module_tensor_to_device(model, param_name, target_device, param_value)
-
-        module, tensor_name = get_module_from_name(model, param_name)
+        param_value = param_value.to(target_device)
 
         # Get FP8 min/max values
         fp8_min = torch.finfo(torch.float8_e4m3fn).min
@@ -131,8 +129,9 @@ def create_quantized_param(
         # Reshape scale to match the number of blocks
         scale = scale.reshape(scale_orig_shape).squeeze().reciprocal()
 
-        module._buffers[tensor_name] = quantized_param.to(target_device)
-        module._buffers["weight_scale_inv"] = scale.to(target_device)
+        # Load into the model
+        _load_parameter_into_model(model, param_name, quantized_param)
+        _load_parameter_into_model(model, param_name.rsplit(".", 1)[0] + ".weight_scale_inv", scale)
 
     def check_quantized_param(
         self,