huggingface · ArthurZucker · Jan 22, 2026 · Jan 12, 2026 · Jan 12, 2026 · Jan 12, 2026
diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py
@@ -21,6 +21,7 @@
     Chunk,
     Concatenate,
     ErnieFuseAndSplitTextVisionExperts,
+    Force16BytesAlignment,
     MergeModulelist,
     Transpose,
     WeightConverter,
@@ -40,6 +41,18 @@
 
 def _build_checkpoint_conversion_mapping():
     mapping = {
+        "gpt_oss": [
+            WeightConverter(
+                source_patterns="mlp.experts.gate_up_proj",
+                target_patterns="mlp.experts.gate_up_proj",
+                operations=[Force16BytesAlignment()],
+            ),
+            WeightConverter(
+                source_patterns="mlp.experts.down_proj",
+                target_patterns="mlp.experts.down_proj",
+                operations=[Force16BytesAlignment()],
+            ),
+        ],
         "mixtral": [
             WeightRenaming(".block_sparse_moe.gate", ".mlp.gate"),
             WeightConverter(
@@ -347,6 +360,10 @@ def get_model_conversion_mapping(
 
     # Add the ones from the quantizer as well if provided
     if hf_quantizer is not None:
-        weight_conversions.extend(hf_quantizer.get_weight_conversions())
+        # NOTE: Since get_weight_conversions() only serves to dequantize, we need to put them first in the list.
+        # However, for now it's not possible to match 1 param with 2 converters (i.e. 1 dequantization converter
+        # and 1 model-specific converter). This means that if a model has model-specific conversions and is being
+        # dequantized, any model-specific conversion whose patterns match the dequantization patterns will be ignored.
+        weight_conversions = hf_quantizer.get_weight_conversions() + weight_conversions
 
     return weight_conversions
diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py
@@ -440,6 +440,42 @@ def reverse_op(self) -> ConversionOps:
         return ErnieFuseAndSplitTextVisionExperts(stack_dim=self.stack_dim, concat_dim=self.concat_dim)
 
 
+class Force16BytesAlignment(ConversionOps):
+    """
+    Ensures that the given tensor is 16-bytes aligned in memory and clones it if not.
+    This guarantees 16-bytes alignment for kernels / implementations that use TMA or SIMD instructions like torch._grouped_mm.
+    """
+
+    @torch.no_grad()
+    def convert(
+        self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str], **kwargs
+    ) -> dict[str, torch.Tensor]:
+        target_pattern = self.get_target_pattern(input_dict, source_patterns, target_patterns)
+        tensors = next(iter(input_dict.values()))
+        tensor = tensors[0] if isinstance(tensors, list) else tensors
+        tensor = tensor.clone() if tensor.data_ptr() % 16 != 0 else tensor
+        return {target_pattern: tensor}
+
+    def get_target_pattern(
+        self, input_dict: dict[str, torch.Tensor], source_patterns: list[str], target_patterns: list[str]
+    ) -> str:
+        if len(input_dict) != 1:
+            raise ValueError("Undefined Operation encountered!")
+        # Here it's the first operation of a chain, so return the source
+        if len(target_patterns) > 1:
+            if len(source_patterns) == 1:
+                return source_patterns[0]
+            else:
+                raise ValueError("Undefined Operation encountered!")
+        # Here it's the only operation, or the last operation in a chain, so we return the target
+        else:
+            return target_patterns[0]
+
+    @property
+    def reverse_op(self) -> ConversionOps:
+        return Force16BytesAlignment()
+
+
 @dataclass(slots=True)
 class WeightTransform:
     source_patterns: str | list[str] = field(init=True)