306 changes: 306 additions & 0 deletions tensorrt_llm/_torch/models/modeling_qwen2vl.py
@@ -27,6 +27,7 @@
from .modeling_utils import register_auto_model, register_vision_encoder

DISAGG = os.getenv('TLLM_MULTIMODAL_DISAGGREGATED', '0') == '1'
ENABLE_FP8_BLOCK_SCALE = os.getenv('TLLM_ENABLE_FP8_BLOCK_SCALE', '0') == '1'


class Qwen2VLInputProcessorBase(InputProcessor):
@@ -364,6 +365,45 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
attn_implementation='flash_attention_2').eval()
# TODO: Make vision model compatible with meta init mode and load_weights at the same place
self.visual = model.visual.to(self.device)

# Check if FP8 Block Scale mode is enabled
# Priority: Environment variable > Config file > Default value
config_enable = getattr(pretrained_config, 'enable_fp8_block_scale', False)
self.enable_fp8_block_scale = ENABLE_FP8_BLOCK_SCALE or config_enable
print(f"FP8 Block Scale mode: {'ENABLED' if self.enable_fp8_block_scale else 'DISABLED'}")
if ENABLE_FP8_BLOCK_SCALE:
print(" - Enabled via environment variable TLLM_ENABLE_FP8_BLOCK_SCALE=1")
elif config_enable:
print(" - Enabled via config file")
else:
print(" - Disabled (use TLLM_ENABLE_FP8_BLOCK_SCALE=1 or set enable_fp8_block_scale=True in config)")

if self.enable_fp8_block_scale:
# Define layer name patterns to be replaced with FP8 Block Scale
# Now supports MLP layers, handling dimension mismatch through padding
self.fp8_block_scale_patterns = [
"blocks.*.attn.qkv", # All block attention qkv
"blocks.*.attn.proj", # Re-enable attention projection, fix reshape logic
"blocks.*.mlp.gate_proj", # All block mlp gate_proj
"blocks.*.mlp.down_proj", # All block mlp down_proj
"blocks.*.mlp.up_proj", # All block mlp up_proj
]

# Allow custom replacement patterns through configuration
if hasattr(pretrained_config, 'fp8_block_scale_patterns'):
self.fp8_block_scale_patterns = pretrained_config.fp8_block_scale_patterns

# Print model structure for debugging
Comment on lines +381 to +396

🛠️ Refactor suggestion

⚠️ Potential issue

Pattern matching will likely never match; switch to fnmatch/glob or re.search with a prefix allowance.

Using re.match later (with patterns like "blocks.*.attn.qkv") requires the string to start with "blocks". Most models name modules like "encoder.blocks.0.attn.qkv", so nothing will match and no layers will be replaced.

No diff here (see the function refactor below), but please adopt one of:

  • Use fnmatch.fnmatchcase(name, pattern) and also try *.{pattern} to allow prefixes, or
  • Use re.search with r'(?:^|.*\.)blocks\.\d+\.attn\.(qkv|proj)$'-style compiled regexes.
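
A minimal standalone sketch of both options (the module name below is a hypothetical example, not taken from this PR):

import fnmatch
import re

patterns = ["blocks.*.attn.qkv", "blocks.*.attn.proj"]
name = "encoder.blocks.0.attn.qkv"  # fully-qualified module name with a prefix

# Option 1: fnmatch, also trying a "*." prefix so nested module names match
matched_fnmatch = any(
    fnmatch.fnmatchcase(name, p) or fnmatch.fnmatchcase(name, f"*.{p}")
    for p in patterns
)

# Option 2: a compiled regex with an explicit prefix allowance, used with re.search
block_regex = re.compile(r"(?:^|.*\.)blocks\.\d+\.attn\.(qkv|proj)$")
matched_regex = bool(block_regex.search(name))

print(matched_fnmatch, matched_regex)  # True True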
🤖 Prompt for AI Agents
In tensorrt_llm/_torch/models/modeling_qwen2vl.py around lines 381 to 396, the
configured patterns like "blocks.*.attn.qkv" will not match typical module names
when later checked with re.match (which anchors at the start); update the
matching strategy so pattern checks succeed: either (preferred) switch to
fnmatch.fnmatchcase and update stored patterns to allow arbitrary prefixes (e.g.
prepending "*." when loading user config or automatically trying both raw and
"*."+pattern), or compile and use re.search with anchored-friendly regexes (e.g.
prepend "(?:^|.*\.)" and escape the pattern parts) so modules like
"encoder.blocks.0.attn.qkv" match; ensure you apply the same matching approach
for both default patterns and any pretrained_config.fp8_block_scale_patterns
provided by users and validate with a small unit test or assertion that example
module names match the intended patterns.

print("Visual model structure:")
for name, module in self.visual.named_modules():
if isinstance(module, torch.nn.Linear):
print(f" Linear layer: {name}")

# Enable replacement functionality - now with pre-quantized weights
self._replace_linear_layers_with_pre_quantization()
else:
print("Skipping FP8 Block Scale layer replacement, using original implementation")

self.post_config()

def post_config(self):
@@ -429,6 +469,272 @@ def _parse_and_batch_multimodal_data(

return mm_content_dict, mm_extra_data

def _replace_linear_layers_with_pre_quantization(self):
"""
Replace linear layers and pre-quantize weights to avoid repeated quantization during forward pass
"""
import re
import torch.nn as nn

# Directly iterate through all submodules of the visual module
for name, module in self.visual.named_modules():
# Check if it's a linear layer
if isinstance(module, nn.Linear):
# Check if it matches any pattern
should_replace = False
for pattern in self.fp8_block_scale_patterns:
# Convert pattern to regex
regex_pattern = pattern.replace("*", r"\d+")
if re.match(regex_pattern, name):
should_replace = True
break

if should_replace:
# Check if weight dimensions meet TensorRT-LLM requirements
# For matrix multiplication input @ weight.T, N dimension is in_features
weight = module.weight
in_features = weight.shape[0] # Input feature dimension
out_features = weight.shape[1] # Output feature dimension
print(f"DEBUG: Checking {name}, weight.shape={weight.shape}, in_features={in_features}, out_features={out_features}, in_features%16={in_features % 16}")

if in_features % 16 != 0:
print(f"Skipping {name}: in_features ({in_features}) not divisible by 16")
continue
Comment on lines +496 to +502

⚠️ Potential issue

Wrong feature dims: nn.Linear.weight is [out_features, in_features].

You set:

  • in_features = weight.shape[0] (this is out_features)
  • out_features = weight.shape[1] (this is in_features)

This leads to wrong divisibility checks and misleading logs.

The function refactor above fixes this by using module.in_features and avoiding manual shape indexing.
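
As a quick standalone illustration of the layout (example dimensions only, not taken from the model):

import torch.nn as nn

layer = nn.Linear(in_features=1280, out_features=3420)
# PyTorch stores Linear weights as [out_features, in_features]
assert layer.weight.shape == (3420, 1280)
# Prefer the attributes over manual shape indexing
assert layer.in_features == 1280 and layer.out_features == 3420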

🧰 Tools
🪛 Ruff (0.12.2)

498-498: Line too long (172 > 120)

(E501)

🤖 Prompt for AI Agents
In tensorrt_llm/_torch/models/modeling_qwen2vl.py around lines 496 to 502, the
code treats nn.Linear.weight shape as [in_features, out_features] but PyTorch
stores weights as [out_features, in_features]; swap usage accordingly and stop
relying on manual shape indexing: use module.in_features and module.out_features
(or module.weight.shape[1]/[0] if needed) for accurate values, update the debug
print to show the correct in/out features, and perform the divisibility check on
module.in_features (not weight.shape[0]); this ensures correct logging and
correct skip logic.


try:
# Create pre-quantized FP8 Block Scale replacement
fp8_linear = self._create_pre_quantized_fp8_block_linear(module)

# Find parent module and child module names
parent_name = '.'.join(name.split('.')[:-1])
child_name = name.split('.')[-1]

if parent_name:
# Get parent module
parent_module = self.visual
for part in parent_name.split('.'):
parent_module = getattr(parent_module, part)

# Replace child module
setattr(parent_module, child_name, fp8_linear)
else:
# Direct replacement
setattr(self.visual, child_name, fp8_linear)

print(f"Replaced Linear layer with Pre-quantized FP8 Block Scale: {name}")
except Exception as e:
print(f"Failed to replace {name}: {e}")

Comment on lines +472 to +527

🛠️ Refactor suggestion

⚠️ Potential issue

Robust replacement: use named_modules_with_parent; fix ModuleList indexing, wrong in_features, and fragile regex.

  • Current traversal uses named_modules() then manual getattr walking; this breaks on ModuleList numeric indices (no attribute '0') and can throw.
  • You compute in_features = weight.shape[0], but nn.Linear.weight is [out_features, in_features]. Use module.in_features.
  • Replace during iteration safely using named_modules_with_parent provided by this repo to avoid stale references.

Apply this diff to rewrite the function:

-    def _replace_linear_layers_with_pre_quantization(self):
-        """
-        Replace linear layers and pre-quantize weights to avoid repeated quantization during forward pass
-        """
-        import re
-        import torch.nn as nn
-        
-        # Directly iterate through all submodules of the visual module
-        for name, module in self.visual.named_modules():
-            # Check if it's a linear layer
-            if isinstance(module, nn.Linear):
-                # Check if it matches any pattern
-                should_replace = False
-                for pattern in self.fp8_block_scale_patterns:
-                    # Convert pattern to regex
-                    regex_pattern = pattern.replace("*", r"\d+")
-                    if re.match(regex_pattern, name):
-                        should_replace = True
-                        break
-                
-                if should_replace:
-                    # Check if weight dimensions meet TensorRT-LLM requirements
-                    # For matrix multiplication input @ weight.T, N dimension is in_features
-                    weight = module.weight
-                    in_features = weight.shape[0]  # Input feature dimension
-                    out_features = weight.shape[1]  # Output feature dimension
-                    print(f"DEBUG: Checking {name}, weight.shape={weight.shape}, in_features={in_features}, out_features={out_features}, in_features%16={in_features % 16}")
-                    
-                    if in_features % 16 != 0:
-                        print(f"Skipping {name}: in_features ({in_features}) not divisible by 16")
-                        continue
-                    
-                    try:
-                        # Create pre-quantized FP8 Block Scale replacement
-                        fp8_linear = self._create_pre_quantized_fp8_block_linear(module)
-                        
-                        # Find parent module and child module names
-                        parent_name = '.'.join(name.split('.')[:-1])
-                        child_name = name.split('.')[-1]
-                        
-                        if parent_name:
-                            # Get parent module
-                            parent_module = self.visual
-                            for part in parent_name.split('.'):
-                                parent_module = getattr(parent_module, part)
-                            
-                            # Replace child module
-                            setattr(parent_module, child_name, fp8_linear)
-                        else:
-                            # Direct replacement
-                            setattr(self.visual, child_name, fp8_linear)
-                            
-                        print(f"Replaced Linear layer with Pre-quantized FP8 Block Scale: {name}")
-                    except Exception as e:
-                        print(f"Failed to replace {name}: {e}")
+    def _replace_linear_layers_with_pre_quantization(self):
+        """
+        Replace target nn.Linear layers with pre-quantized FP8 Block-Scale wrappers.
+        Uses named_modules_with_parent to safely mutate the module tree while iterating.
+        """
+        import fnmatch
+        num_replaced = 0
+
+        for name, module, parent in self.visual.named_modules_with_parent(remove_duplicate=True):
+            if parent is None or not isinstance(module, nn.Linear):
+                continue
+
+            # Match both exact patterns and those preceded by any prefix (e.g., "encoder.")
+            matched = any(
+                fnmatch.fnmatchcase(name, pat) or fnmatch.fnmatchcase(name, f"*.{pat}")
+                for pat in self.fp8_block_scale_patterns
+            )
+            if not matched:
+                continue
+
+            in_features = module.in_features
+            if in_features % 16 != 0:
+                logger.debug("FP8 skip %s: in_features (%d) %% 16 != 0", name, in_features)
+                continue
+
+            try:
+                fp8_linear = self._create_pre_quantized_fp8_block_linear(module)
+                if fp8_linear is None:
+                    logger.debug("FP8 skip %s: pre-quantization unavailable/failed.", name)
+                    continue
+
+                child_name = name.rsplit(".", 1)[-1]
+                setattr(parent, child_name, fp8_linear)
+                num_replaced += 1
+                logger.info("FP8 Block Scale: replaced layer %s", name)
+            except Exception as e:
+                logger.exception("FP8 Block Scale: failed to replace %s: %s", name, e)
+
+        logger.info("FP8 Block Scale: total Linear layers replaced: %d", num_replaced)

Committable suggestion skipped: line range outside the PR's diff.

🧰 Tools
🪛 Ruff (0.12.2)

498-498: Line too long (172 > 120)

(E501)

🤖 Prompt for AI Agents
In tensorrt_llm/_torch/models/modeling_qwen2vl.py around lines 472-527, replace
the current traversal and replacement logic with a safe approach: iterate using
the repository-provided named_modules_with_parent to get (parent, name, module)
so you never need fragile getattr walks (this also preserves ModuleList
parents), compute dimensions from the layer attributes (use module.in_features
and module.out_features instead of weight.shape indices), match
fp8_block_scale_patterns using fnmatch.fnmatchcase(name, pattern) (or equivalent
wildcard matching) instead of fragile regex construction, and perform the
replacement on the parent correctly (if parent is a Module and has the attribute
name use setattr, if parent is a ModuleList or list and name.isdigit() assign
parent[int(name)] = fp8_linear) while wrapping creation in try/except and
logging failures.

def _create_pre_quantized_fp8_block_linear(self, original_linear):
"""
Create pre-quantized FP8 Block Linear replacement layer

Args:
original_linear: Original nn.Linear layer

Returns:
Pre-quantized FP8 Block Linear layer
"""
import torch.nn as nn

class PreQuantizedTrtllmFp8BlockLinear(nn.Module):
def __init__(self, original_linear):
super().__init__()
self.original_linear = original_linear

# Pre-quantize weights and scaling factors
print(f"Pre-quantizing weights for layer with shape {original_linear.weight.shape}")
self.weight_fp8, self.weight_scale = self._pre_quantize_weight(original_linear.weight)

# Move quantized weights and scaling factors to CPU to save GPU memory
self.weight_fp8 = self.weight_fp8.cpu()
self.weight_scale = self.weight_scale.cpu()

print(f"Pre-quantization completed. Weight FP8 shape: {self.weight_fp8.shape}, Scale shape: {self.weight_scale.shape}")

Comment on lines +545 to +554

🛠️ Refactor suggestion

⚠️ Potential issue

Do not move quantized weights to CPU; this induces per-forward HtoD copies. Keep them as (non-persistent) buffers on the target device.

Current code moves FP8 weights/scales to CPU, then copies to GPU every forward. This will dominate latency and negate any GEMM speedup.

Apply this diff:

-                self.weight_fp8, self.weight_scale = self._pre_quantize_weight(original_linear.weight)
-                
-                # Move quantized weights and scaling factors to CPU to save GPU memory
-                self.weight_fp8 = self.weight_fp8.cpu()
-                self.weight_scale = self.weight_scale.cpu()
+                q_weight, q_scale = self._pre_quantize_weight(original_linear.weight)
+                self._fp8_enabled = (q_weight.dtype == torch.float8_e4m3fn)
+                # Keep on the same device as the original weight to avoid runtime transfers
+                self.register_buffer("weight_fp8", q_weight.to(original_linear.weight.device), persistent=False)
+                self.register_buffer("weight_scale", q_scale.to(original_linear.weight.device), persistent=False)

If memory is a concern, consider moving the original FP16/BF16 weight to CPU when FP8 is enabled and using the original only as a fallback path.
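
For context, a minimal standalone sketch of the non-persistent buffer pattern (the class and the scale computation here are illustrative, not the PR's code):

import torch
import torch.nn as nn

class CachedScaleLinear(nn.Module):
    """Toy wrapper that keeps a precomputed tensor as a non-persistent buffer."""

    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.linear = linear
        # Non-persistent buffers follow .to()/.cuda() calls on the module
        # and are excluded from state_dict(), so checkpoints stay unchanged.
        self.register_buffer(
            "weight_scale",
            linear.weight.abs().amax(dim=1, keepdim=True),
            persistent=False,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)

m = CachedScaleLinear(nn.Linear(16, 8))
assert "weight_scale" not in m.state_dict()
if torch.cuda.is_available():
    m = m.cuda()
print(m.weight_scale.device)  # same device as the module's parameters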

🧰 Tools
🪛 Ruff (0.12.2)

553-553: Line too long (135 > 120)

(E501)

🤖 Prompt for AI Agents
In tensorrt_llm/_torch/models/modeling_qwen2vl.py around lines 545 to 554, do
not move the quantized FP8 weight and scale tensors to CPU (the .cpu() calls)
because that forces HtoD copies each forward; instead register them as
non-persistent buffers on the target device (e.g., device =
original_linear.weight.device) so they remain on GPU: remove the .cpu() calls,
call self.register_buffer("weight_fp8", self.weight_fp8.to(device),
persistent=False) and self.register_buffer("weight_scale",
self.weight_scale.to(device), persistent=False) (or set the tensors directly on
device if already assigned), and if memory is a concern optionally move the
original FP16/BF16 weight to CPU as fallback while keeping the FP8 tensors
resident on the target device.

try:
import tensorrt_llm
pass
except ImportError:
raise ImportError("TensorRT-LLM is not installed.")

def _pre_quantize_weight(self, weight: torch.Tensor):
"""
Pre-quantize weights, executed once during initialization
"""
print(f"Starting pre-quantization for weight with shape {weight.shape}")

# Check if N dimension is divisible by 16
if weight.shape[1] % 16 != 0:
print(f"Warning: Matrix N dimension ({weight.shape[1]}) not divisible by 16, skipping FP8 quantization")
return weight, torch.ones(1, device=weight.device, dtype=torch.float32)

# Execute block-wise quantization
quantized_weight, scale = self._create_blockwise_quantized_weight(weight)

if quantized_weight.dtype != torch.float8_e4m3fn:
print(f"Warning: Failed to quantize weight, using original")
return weight, torch.ones(1, device=weight.device, dtype=torch.float32)

print(f"Pre-quantization successful. Quantized weight shape: {quantized_weight.shape}")
return quantized_weight, scale

def forward(self, input: torch.Tensor) -> torch.Tensor:
"""Forward method using pre-quantized weights"""
# Get parameters from original linear layer
bias = getattr(self.original_linear, 'bias', None)

# Check if input dimensions meet requirements
input_features = input.shape[-1]
if input_features % 16 != 0:
print(f"Using original linear layer: input_features ({input_features}) not divisible by 16")
return self.original_linear(input)

# Save original shape and data type
origin_shape = input.shape
origin_dtype = input.dtype
input = input.to(torch.bfloat16)

if input.dim() > 2:
input = input.reshape(-1, input.shape[-1])

# Execute input FP8 quantization
act_input_fp8, input_scale = torch.ops.trtllm.fp8_quantize_1x128(input)

# Move pre-quantized weights and scaling factors to current device
weight_fp8 = self.weight_fp8.to(input.device)
weight_scale = self.weight_scale.to(input.device)

# Execute FP8 GEMM
output = torch.ops.trtllm.fp8_block_scaling_gemm(act_input_fp8, weight_fp8, input_scale, weight_scale)
output = output.to(origin_dtype)

if bias is not None:
output = output + bias

# Handle output shape
if output.dim() == 2:
if len(origin_shape) == 3:
batch_size, seq_len, hidden_size = origin_shape
output = output.reshape(batch_size, seq_len, hidden_size)
elif len(origin_shape) == 2:
pass # No reshape needed
else:
return self.original_linear(input)

return output

Comment on lines +582 to +626

⚠️ Potential issue

Forward path issues: missing fallback when weight isn’t FP8, CPU→GPU copies, and incorrect reshape dimension.

  • Fall back to the original linear if pre-quantization failed (weight dtype != FP8) or input features aren't divisible by 16.
  • Avoid copying weights from CPU every call (fixed above).
  • Reshape must use out_features, not input hidden size.

Apply this diff:

-            def forward(self, input: torch.Tensor) -> torch.Tensor:
+            def forward(self, input: torch.Tensor) -> torch.Tensor:
                 """Forward method using pre-quantized weights"""
-                # Get parameters from original linear layer
-                bias = getattr(self.original_linear, 'bias', None)
-                
-                # Check if input dimensions meet requirements
-                input_features = input.shape[-1]
-                if input_features % 16 != 0:
-                    print(f"Using original linear layer: input_features ({input_features}) not divisible by 16")
-                    return self.original_linear(input)
+                # Fallback if FP8 not enabled or input dim incompatible
+                input_features = input.shape[-1]
+                if not getattr(self, "_fp8_enabled", False) or (input_features % 16 != 0):
+                    return self.original_linear(input)
+                bias = getattr(self.original_linear, "bias", None)
                 
                 # Save original shape and data type
-                origin_shape = input.shape
-                origin_dtype = input.dtype
-                input = input.to(torch.bfloat16)
+                origin_shape = input.shape
+                origin_dtype = input.dtype
+                x = input.to(torch.bfloat16)
 
-                if input.dim() > 2:
-                    input = input.reshape(-1, input.shape[-1])
+                if x.dim() > 2:
+                    x = x.reshape(-1, x.shape[-1])
 
                 # Execute input FP8 quantization
-                act_input_fp8, input_scale = torch.ops.trtllm.fp8_quantize_1x128(input)
-                
-                # Move pre-quantized weights and scaling factors to current device
-                weight_fp8 = self.weight_fp8.to(input.device)
-                weight_scale = self.weight_scale.to(input.device)
+                act_input_fp8, input_scale = torch.ops.trtllm.fp8_quantize_1x128(x)
+                weight_fp8 = self.weight_fp8
+                weight_scale = self.weight_scale
+                if weight_fp8.device != x.device:
+                    weight_fp8 = weight_fp8.to(x.device)
+                    weight_scale = weight_scale.to(x.device)
                 
                 # Execute FP8 GEMM
                 output = torch.ops.trtllm.fp8_block_scaling_gemm(act_input_fp8, weight_fp8, input_scale, weight_scale)
                 output = output.to(origin_dtype)
 
                 if bias is not None:
                     output = output + bias
                     
                 # Handle output shape
                 if output.dim() == 2:
                     if len(origin_shape) == 3:
-                        batch_size, seq_len, hidden_size = origin_shape
-                        output = output.reshape(batch_size, seq_len, hidden_size)
+                        batch_size, seq_len, _ = origin_shape
+                        out_features = self.original_linear.out_features
+                        output = output.reshape(batch_size, seq_len, out_features)
                     elif len(origin_shape) == 2:
                         pass  # No reshape needed
                     else:
-                        return self.original_linear(input)
+                        return self.original_linear(input)
                     
                 return output
🤖 Prompt for AI Agents
tensorrt_llm/_torch/models/modeling_qwen2vl.py around lines 582-626: the forward
needs a robust fallback when pre-quantized weights are not available, must avoid
unnecessary CPU→GPU copies each call, and must reshape using the layer's
out_features (not input hidden size). Add a fast check that the module actually
has valid FP8 weights (e.g., self.weight_fp8 and self.weight_scale are present
and have the expected FP8 dtype/device/state) and if not, call and return
self.original_linear on the original input (restore original dtype/device/shape
before calling); only move weight_fp8/weight_scale to input.device when they
exist and are not already on that device (minimize copies) and preferably
register them as buffers when created so they live on the module device; after
GEMM, when restoring output shape, use self.original_linear.out_features (or the
linear's out_features attribute) for the feature dimension instead of input
hidden size; ensure any early returns use the original input/shape/dtype rather
than the transformed tensors.

def _create_blockwise_quantized_weight(
self,
param_value: torch.Tensor,
block_size: int = 128,
):
"""
Create block-wise quantized weights
Reference: transformers fp8 128*128 block quantization
Supports padding non-128-multiple matrices to 128 multiples
"""
param_value = param_value.to(torch.float32)

# Get FP8 min/max values
fp8_min = torch.finfo(torch.float8_e4m3fn).min
fp8_max = torch.finfo(torch.float8_e4m3fn).max

rows, cols = param_value.shape[-2:]
original_shape = param_value.shape

# Check if N dimension is divisible by 16 (TensorRT-LLM FP8 GEMM requirement)
# For matrix multiplication input @ weight.T, N dimension is cols (in_features)
if cols % 16 != 0:
print(f"Warning: Matrix N dimension ({cols}) not divisible by 16, skipping FP8 quantization")
return param_value, torch.ones(1, device=param_value.device, dtype=torch.float32)

# Calculate padding needed for rows and columns
# Round up to block_size multiples
target_rows = ((rows + block_size - 1) // block_size) * block_size
target_cols = ((cols + block_size - 1) // block_size) * block_size
pad_rows = target_rows - rows
pad_cols = target_cols - cols

# Perform padding if needed
if pad_rows > 0 or pad_cols > 0:
print(f"Padding matrix from ({rows}, {cols}) to ({rows + pad_rows}, {cols + pad_cols})")

# Create padded weight matrix
padded_weight = torch.zeros(
rows + pad_rows, cols + pad_cols,
device=param_value.device, dtype=param_value.dtype
)

# Copy original weights to top-left corner of padded matrix
padded_weight[:rows, :cols] = param_value

# Use padded weights for quantization
param_value = padded_weight
rows, cols = rows + pad_rows, cols + pad_cols

# Now matrix dimensions are multiples of 128, can perform block-wise quantization
block_size_m, block_size_n = block_size, block_size
param_value_orig_shape = param_value.shape
param_value = param_value.reshape(
-1, rows // block_size_m, block_size_m, cols // block_size_n, block_size_n
).permute(0, 1, 3, 2, 4)

# Calculate scaling factor for each block
max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2))
scale = fp8_max / max_abs
scale_orig_shape = scale.shape
scale = scale.unsqueeze(-1).unsqueeze(-1)

@torch.compiler.disable()
def _quantize(param_value, scale, fp8_min, fp8_max):
# Quantize the weights
quantized_param = torch.clamp(param_value * scale, min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

quantized_param = quantized_param.permute(0, 1, 3, 2, 4)
# Reshape back to matrix shape
quantized_param = quantized_param.reshape(param_value_orig_shape)

# Reshape scale to match the number of blocks
scale = scale.reshape(scale_orig_shape).squeeze().reciprocal()

return quantized_param, scale

quantized_param, scale = _quantize(param_value, scale, fp8_min, fp8_max)

# If original matrix was padded, crop back to original size
if pad_rows > 0 or pad_cols > 0:
quantized_param = quantized_param[:original_shape[0], :original_shape[1]]

return quantized_param, scale

Comment on lines +627 to +710

💡 Verification agent

🧩 Analysis chain

Blockwise quantization: handle zeros, crop scales consistently with weight cropping, and avoid shape-destroying squeeze.

  • When a row/col block has amax=0, scale = fp8_max / 0 → inf. Add epsilon to avoid inf/nan.
  • You crop quantized_param back to the original size but not scale, which can mismatch kernel expectations. Crop the block-scale grid accordingly.
  • squeeze() may drop needed dims when a dimension equals 1; avoid it.

Apply this diff:

+                import math
@@
-                max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2))
-                scale = fp8_max / max_abs
+                max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2))
+                # Avoid division by zero for all-zero blocks
+                eps = torch.tensor(1e-6, device=max_abs.device, dtype=max_abs.dtype)
+                scale = fp8_max / torch.maximum(max_abs, eps)
                 scale_orig_shape = scale.shape
                 scale = scale.unsqueeze(-1).unsqueeze(-1)
@@
-                    # Reshape scale to match the number of blocks
-                    scale = scale.reshape(scale_orig_shape).squeeze().reciprocal()
+                    # Reshape scale to match the number of blocks and convert to dequant scale (amax/fp8_max)
+                    scale = scale.reshape(scale_orig_shape).reciprocal()
@@
-                # If original matrix was padded, crop back to original size
+                # If original matrix was padded, crop both quantized weight and block-scales
                 if pad_rows > 0 or pad_cols > 0:
                     quantized_param = quantized_param[:original_shape[0], :original_shape[1]]
+                    # Crop block scales to ceil(original/block_size)
+                    orig_block_rows = (original_shape[0] + block_size_m - 1) // block_size_m
+                    orig_block_cols = (original_shape[1] + block_size_n - 1) // block_size_n
+                    if scale.dim() == 3:
+                        scale = scale[:, :orig_block_rows, :orig_block_cols]
+                    elif scale.dim() == 2:
+                        scale = scale[:orig_block_rows, :orig_block_cols]

Please verify the shape that torch.ops.trtllm.fp8_block_scaling_gemm expects for weight_scale (per-block grid vs. flattened), and adjust accordingly.


🏁 Script executed:

#!/bin/bash
# Search for fp8_block_scaling_gemm usage or definitions
rg -n "fp8_block_scaling_gemm" -C3 . || true

# Search for torch.ops.trtllm references (to find where the operator is defined or wrapped)
rg -n "torch\.ops\.trtllm" -C3 . || true

# Search for any documentation or tests calling fp8_block_scaling_gemm to infer expected weight_scale shape
rg -n "fp8_block_scaling" -C3 . || true

Length of output: 262474


Mandatory Refactor: Strengthen Blockwise FP8 Quantization Robustness

To ensure correct, crash-free blockwise FP8 weight quantization, please update _create_blockwise_quantized_weight as follows:

  • Prevent division by zero when a block is all zeros
    Add a small epsilon before computing scale = fp8_max / max_abs to avoid inf/nan.
  • Crop the block‐scale grid in sync with the cropped weight matrix
    When you slice quantized_param back to original_shape, also trim scale to the corresponding number of blocks.
  • Preserve explicit block‐scale tensor dimensions
    Replace the broad .squeeze() call with a direct reshape on scale_orig_shape so you never accidentally drop a needed dimension.

Suggested diff (within _create_blockwise_quantized_weight):

+                import math
@@
-                max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2))
-                scale = fp8_max / max_abs
+                max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2))
+                # Avoid division by zero for all-zero blocks
+                eps = torch.tensor(1e-6, device=max_abs.device, dtype=max_abs.dtype)
+                scale = fp8_max / torch.maximum(max_abs, eps)
                 scale_orig_shape = scale.shape
                 scale = scale.unsqueeze(-1).unsqueeze(-1)
@@
-                    # Reshape scale to match the number of blocks
-                    scale = scale.reshape(scale_orig_shape).squeeze().reciprocal()
+                    # Reshape scale to match blocks and compute dequant factors
+                    scale = scale.reshape(scale_orig_shape).reciprocal()
@@
-                # If original matrix was padded, crop back to original size
+                # If we padded the matrix, crop both weight and block scales
                 if pad_rows > 0 or pad_cols > 0:
                     quantized_param = quantized_param[:original_shape[0], :original_shape[1]]
+                    # Determine original block counts
+                    orig_block_rows = math.ceil(original_shape[0] / block_size_m)
+                    orig_block_cols = math.ceil(original_shape[1] / block_size_n)
+                    # Trim scale to [orig_block_rows, orig_block_cols] (or include batch dim)
+                    if scale.dim() == 3:
+                        scale = scale[:, :orig_block_rows, :orig_block_cols]
+                    else:
+                        scale = scale[:orig_block_rows, :orig_block_cols]

Please verify whether torch.ops.trtllm.fp8_block_scaling_gemm expects the weight_scale tensor as a 2-D block grid ([row_blocks, col_blocks]) or with an extra leading batch dimension. Adjust the final squeeze/reshape accordingly.

🤖 Prompt for AI Agents
In tensorrt_llm/_torch/models/modeling_qwen2vl.py around lines 627-710, the
block-wise FP8 quantization can produce inf/nan when a block is all zeros and
incorrectly trims/reshapes the block-scale tensor; fix by adding a small epsilon
to max_abs before computing scale (e.g. max_abs = torch.clamp(max_abs,
min=eps)), ensure you crop the scale grid in sync with quantized_param when you
slice back to original_shape (compute the number of row/col blocks corresponding
to original_shape and slice scale accordingly), and replace the broad .squeeze()
with an explicit reshape using scale_orig_shape (or scale_orig_shape without the
last two singleton dims) before taking reciprocal so you preserve the exact
block-grid dimensions expected by torch.ops.trtllm.fp8_block_scaling_gemm
(verify whether it needs [row_blocks, col_blocks] or [batch, row_blocks,
col_blocks] and shape the final scale tensor accordingly).

@property
def weight(self):
return self.original_linear.weight

@property
def bias(self):
return getattr(self.original_linear, 'bias', None)

@property
def in_features(self):
return self.original_linear.in_features

@property
def out_features(self):
return self.original_linear.out_features

return PreQuantizedTrtllmFp8BlockLinear(original_linear)

def is_fp8_blockscale_enabled(self) -> bool:
"""
Check if FP8 Block Scale mode is enabled

Returns:
bool: True if FP8 mode is enabled, False otherwise
"""
return getattr(self, 'enable_fp8_block_scale', False)

@torch.inference_mode()
def forward(self, multimodal_params: List[MultimodalParams]):
