diff --git a/unsloth_zoo/saving_utils.py b/unsloth_zoo/saving_utils.py
index ce002a46e..f7f3ab0e4 100644
--- a/unsloth_zoo/saving_utils.py
+++ b/unsloth_zoo/saving_utils.py
@@ -446,6 +446,13 @@ def create_lora_statistics(model, merge_into_original = False, return_state_dict
             if name.endswith(".base_layer.weight"):
                 name = name[:-len(".base_layer.weight")]
 
+            # modules_to_save wraps embed_tokens / lm_head; strip the wrapper
+            # so the key matches lora_weights entries created by the branch above.
+            # Only the .weight key is stripped; the lora_weights branch below emits both
+            # .weight and .bias from the module, so no separate bias entry is needed.
+            elif name.endswith(".modules_to_save.default.weight"):
+                name = name[:-len(".modules_to_save.default.weight")]
+
             if name in lora_weights:
                 state_dict[name + ".weight"]   = lora_weights[name]
                 if getattr(lora_weights[name].module, "bias", None) is not None:
@@ -1479,7 +1486,19 @@ def _merge_and_overwrite_lora_mxfp4(save_directory, filename, lora_weights, outp
 
 def get_torch_storage_size_new(x, element_size):
     if isinstance(x, LoraStats):
-        shape = (x.module.in_features, x.module.out_features)
+        mod = x.module
+        # Size from an actual weight tensor: the modules_to_save weight if any, else mod.weight
+        saved_w = _get_modules_to_save_weight(mod)
+        if saved_w is None and hasattr(mod, "weight"):
+            saved_w = mod.weight
+        if saved_w is not None and hasattr(saved_w, "shape"):
+            return int(np.prod(saved_w.shape)) * element_size
+        # mod is None (e.g. MoE LoRA wrappers with no .base_layer): merged shape is (lora_B rows, lora_A cols)
+        if mod is None and x.lora_A is not None and x.lora_B is not None:
+            shape = (x.lora_B.shape[0], x.lora_A.shape[1])
+            return int(np.prod(shape)) * element_size
+        # Fallback for Linear-like modules
+        shape = (mod.in_features, mod.out_features)
         return int(np.prod(shape)) * element_size
     else:
         return get_torch_storage_size(x)