diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 745b210208..980425e1f1 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -55,7 +55,7 @@ pass # Reduce VRAM usage by reducing fragmentation -os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,roundup_power2_divisions:[64:128,256:64,>:32]" # Hugging Face Hub faster downloads if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ: diff --git a/unsloth/kernels/__init__.py b/unsloth/kernels/__init__.py index 82e7641693..ef5fa5da70 100644 --- a/unsloth/kernels/__init__.py +++ b/unsloth/kernels/__init__.py @@ -42,6 +42,7 @@ apply_lora_mlp_geglu_approx, apply_lora_qkv, apply_lora_o, + fast_lora_forward, ) from .utils import fast_dequantize, fast_gemv, QUANT_STATE, fast_linear_forward, matmul_lora diff --git a/unsloth/kernels/fast_lora.py b/unsloth/kernels/fast_lora.py index 2177b43b9e..c2b7929a29 100644 --- a/unsloth/kernels/fast_lora.py +++ b/unsloth/kernels/fast_lora.py @@ -410,3 +410,81 @@ def apply_lora_o(self, X): O = LoRA_W.apply(X, OW, OW_quant, OA, OB, OS) return O pass + + +IDENTITY_DROPOUT = torch.nn.Identity +@torch._disable_dynamo +def fast_lora_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "Unsloth: Currently not supported yet - reshaping done incorrectly" + ) + self._check_forward_args(x, *args, **kwargs) + adapter_names = kwargs.pop("adapter_names", None) + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif adapter_names is not None: + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + # Fastpath + if len(self.active_adapters) == 1: + active_adapter = self.active_adapters[0] + if active_adapter not in self.lora_A.keys(): return self.base_layer(x, *args, **kwargs) + + dropout = 
self.lora_dropout[active_adapter] + if isinstance(dropout, IDENTITY_DROPOUT) and not self.use_dora[active_adapter]: + lora_A = self.lora_A[active_adapter].weight + lora_B = self.lora_B[active_adapter].weight + scaling = self.scaling[active_adapter] + W = self.base_layer.weight + return LoRA_W.apply(x, W, QUANT_STATE(W), lora_A, lora_B, scaling) + pass + pass + + result = self.base_layer(x, *args, **kwargs) + # As per Tim Dettmers, for 4bit, we need to defensively clone here. + # The reason is that in some cases, an error can occur that backprop + # does not work on a manipulated view. This issue may be solved with + # newer PyTorch versions but this would need extensive testing to be + # sure. + result = result.clone() + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = x.to(lora_A.weight.dtype) + + if not self.use_dora[active_adapter]: + result = result + lora_B(lora_A(dropout(x))) * scaling + else: + if isinstance(dropout, torch.nn.Identity) or not self.training: + base_result = result + else: + x = dropout(x) + base_result = None + + result = result + self.lora_magnitude_vector[active_adapter]( + x, + lora_A=lora_A, + lora_B=lora_B, + scaling=scaling, + base_layer=self.get_base_layer(), + base_result=base_result, + ) + if requires_conversion: + result = result.to(expected_dtype) + + return result +pass diff --git a/unsloth/kernels/rms_layernorm.py b/unsloth/kernels/rms_layernorm.py index 4b22f8c3e5..b74d636c63 100644 --- a/unsloth/kernels/rms_layernorm.py +++ b/unsloth/kernels/rms_layernorm.py @@ -57,6 +57,7 @@ def _rms_layernorm_forward( @triton.jit def _rms_layernorm_backward( dY, dY_row_stride, + dX, dX_row_stride, X, 
X_row_stride, W, W_row_stride, r, r_row_stride, @@ -78,6 +79,9 @@ def _rms_layernorm_backward( X += row_idx * X_row_stride r += row_idx * r_row_stride + if GEMMA: dX += row_idx * dX_row_stride + else: dX = dY + dY_row = tl.load(dY + col_offsets, mask = mask, other = 0).to(tl.float32) X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32) W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32) @@ -91,7 +95,7 @@ rowsum_dY_normed = tl.sum(dY_W * normed, axis = 0) output = inv_var/n_cols * (n_cols*dY_W - normed*rowsum_dY_normed) - tl.store(dY + col_offsets, output, mask = mask) + tl.store(dX + col_offsets, output, mask = mask) pass @@ -172,9 +176,11 @@ def backward(ctx, dY : torch.Tensor): n_cols : int n_rows, n_cols = dY.shape # dW = X + dX = torch.empty_like(dY) if ctx.GEMMA else dY _rms_layernorm_backward[(n_rows,)]( dY, dY.stride(0), + dX, dX.stride(0), X, X .stride(0), W, W .stride(0), r, r .stride(0), @@ -184,7 +190,7 @@ BLOCK_SIZE = ctx.BLOCK_SIZE, num_warps = ctx.num_warps, ) - dX = dY.view(*shape) + dX = dX.view(*shape) return dX, None, None, None pass pass diff --git a/unsloth/models/__init__.py b/unsloth/models/__init__.py index e67a9e5fad..3230cdc207 100644 --- a/unsloth/models/__init__.py +++ b/unsloth/models/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .loader import FastLanguageModel +from .loader import FastLanguageModel, FastVisionModel from .llama import FastLlamaModel from .mistral import FastMistralModel from .qwen2 import FastQwen2Model diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index daa81d97ac..ee85ba3c36 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2024.11.7" +__version__ = "2024.11.8" __all__ = [ "prepare_model_for_kbit_training", @@ -52,6 +52,17 @@ "unpatch_unsloth_gradient_checkpointing", "patch_gradient_checkpointing", "unpatch_gradient_checkpointing", + + "HAS_CUT_CROSS_ENTROPY", + "fused_linear_cross_entropy", + "patch_unsloth_smart_gradient_checkpointing", + "unpatch_unsloth_smart_gradient_checkpointing", + "create_gradient_checkpointing_buffer", + + "patch_compiled_autograd", + "process_vision_info", + "unsloth_compile_transformers", + "patch_fast_lora", ] import torch @@ -70,6 +81,7 @@ patch_layernorm, patch_torch_compile, patch_model_and_tokenizer, + patch_compiled_autograd, ) from unsloth_zoo.gradient_checkpointing import ( Unsloth_Offloaded_Gradient_Checkpointer, @@ -81,6 +93,21 @@ unsloth_gradient_checkpoint, patch_gradient_checkpointing, unpatch_gradient_checkpointing, + + patch_unsloth_smart_gradient_checkpointing, + unpatch_unsloth_smart_gradient_checkpointing, + create_gradient_checkpointing_buffer, +) +from unsloth_zoo.loss_utils import ( + HAS_CUT_CROSS_ENTROPY, + fused_linear_cross_entropy, +) +from unsloth_zoo.vision_utils import ( + process_vision_info, +) +from unsloth_zoo.compiler import ( + get_transformers_model_type, + unsloth_compile_transformers as _unsloth_compile_transformers, ) # ============================================= @@ -120,6 +147,22 @@ def filter(self, x): return not (self.text in x.getMessage()) except: pass +# The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function. 
+try: + from accelerate.utils.modeling import logger as accelerate_utils_modeling_logger + accelerate_utils_modeling_logger.addFilter(HideLoggingMessage("The model weights are not tied")) + del accelerate_utils_modeling_logger +except: + pass + +# Setting `pad_token_id` to `eos_token_id` +try: + from transformers.generation.utils import logger as transformers_generation_utils_logger + transformers_generation_utils_logger.addFilter(HideLoggingMessage("Setting `pad_token_id` to `eos_token_id`")) + del transformers_generation_utils_logger +except: + pass + # ============================================= # ============================================= @@ -282,54 +325,60 @@ def _is_openai_available(): return False # ============================================= # Get Xformers -from xformers import __version__ as xformers_version -# Temporarily disable 0.0.27 and higher - inference issues -if False: #Version(xformers_version) >= Version("0.0.27"): - raise ImportError( - "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ - "then press Disconnect Runtime and then Restart it.\n"\ - "\n"\ - "%%capture\n" - "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" - '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' - '!pip install --no-deps "xformers<=0.0.27" trl peft accelerate bitsandbytes\n'\ - '\n'\ - f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ - 'Please downgrade xformers via `pip install --force-reinstall "xformers<=0.0.27"' - ) -pass +try: + from xformers import __version__ as xformers_version + # Temporarily disable 0.0.27 and higher - inference issues + if False: #Version(xformers_version) >= Version("0.0.27"): + raise ImportError( + "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ + "then press Disconnect Runtime and then Restart it.\n"\ + "\n"\ + 
"%%capture\n" + "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" + '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' + '!pip install --no-deps "xformers<=0.0.27" trl peft accelerate bitsandbytes\n'\ + '\n'\ + f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ + 'Please downgrade xformers via `pip install --force-reinstall "xformers<=0.0.27"' + ) + pass -if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): - raise ImportError( - f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ - f"Please install xformers < 0.0.24 for torch = {torch_version}." - ) -elif Version(torch_version) < Version("2.3.0") and Version(xformers_version) >= Version("0.0.26"): - raise ImportError( - f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ - f"Please install xformers < 0.0.26 for torch = {torch_version}." - ) -elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) > Version("0.0.27"): - raise ImportError( - f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ - f"Please install xformers <= 0.0.27 for torch = {torch_version}." - ) -pass + if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.24 for torch = {torch_version}." + ) + elif Version(torch_version) < Version("2.3.0") and Version(xformers_version) >= Version("0.0.26"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.26 for torch = {torch_version}." 
+ ) + elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) > Version("0.0.27"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers <= 0.0.27 for torch = {torch_version}." + ) + pass -from xformers._cpp_lib import _register_extensions -try: - _register_extensions() # Check if C++ modules are loaded correctly -except Exception as error: - raise ImportError( - "Unsloth: Xformers was not installed correctly.\n"\ - "Please install xformers separately first.\n"\ - "Then confirm if it's correctly installed by running:\n"\ - "python -m xformers.info\n\n" - "Longer error message:\n" + str(error) - ) + from xformers._cpp_lib import _register_extensions + try: + _register_extensions() # Check if C++ modules are loaded correctly + except Exception as error: + raise ImportError( + "Unsloth: Xformers was not installed correctly.\n"\ + "Please install xformers separately first.\n"\ + "Then confirm if it's correctly installed by running:\n"\ + "python -m xformers.info\n\n" + "Longer error message:\n" + str(error) + ) + pass + import xformers.ops.fmha as xformers + xformers_attention = xformers.memory_efficient_attention +except: + xformers = None + xformers_attention = None + xformers_version = None pass -import xformers.ops.fmha as xformers -xformers_attention = xformers.memory_efficient_attention # Check TRL version from trl import __version__ as trl_version @@ -658,7 +707,7 @@ def get_statistics(): ) def _prepare_backend( - self, cpu: bool = False, sagemaker_dp = False, backend: str = None, + self, cpu = False, sagemaker_dp = False, backend: str = None, ) -> tuple[str, DistributedType]: return None, DistributedType.NO pass @@ -1047,3 +1096,69 @@ def patch_tokenizer(model, tokenizer): model.config.update({"unsloth_version" : __version__}) return model, tokenizer pass + + +def patch_fast_lora(): + import peft.tuners.lora.bnb + peft.tuners.lora.bnb.Linear4bit.forward = 
fast_lora_forward +pass + + +def unsloth_compile_transformers( + model_name, + token = None, + revision = None, + trust_remote_code = False, + sdpa_dynamic_mask = True, + sdpa_bool_masks = True, + sdpa_gqa_replace = True, + sdpa_dynamic_compile = True, + compile_attention = True, + disable_causal_masks = True, + compile_torch_modules = True, + compile_custom_modules = True, + compile_function_calls = True, + fuse_lm_head = True, + gradient_checkpointing = True, + manual_replacements = True, + epilogue_fusion = True, + max_autotune = False, + shape_padding = True, + cudagraphs = False, + debug = False, + import_from_cache = False, + disable = False, +): + if disable: return + model_types = get_transformers_model_type( + model_name = model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + for model_type in model_types: + _unsloth_compile_transformers( + model_type, + sdpa_dynamic_mask = sdpa_dynamic_mask, + sdpa_bool_masks = sdpa_bool_masks, + sdpa_gqa_replace = sdpa_gqa_replace, + sdpa_dynamic_compile = sdpa_dynamic_compile, + compile_attention = compile_attention, + disable_causal_masks = disable_causal_masks, + compile_torch_modules = compile_torch_modules, + compile_custom_modules = compile_custom_modules, + compile_function_calls = compile_function_calls, + fuse_lm_head = fuse_lm_head, + gradient_checkpointing = gradient_checkpointing, + manual_replacements = manual_replacements, + epilogue_fusion = epilogue_fusion, + max_autotune = max_autotune, + shape_padding = shape_padding, + cudagraphs = cudagraphs, + debug = debug, + import_from_cache = import_from_cache, + disable = disable, + ) + pass + return model_types +pass diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index 4eb9d64313..62ecb9690f 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -60,8 +60,7 @@ from flash_attn import flash_attn_func # [TODO] We must randomnly use torch.compile? 
-# I checked the gradients and formulas and I'm sure it's correct. -# I'm stumped :( +# Gemma 2 uses double RMS Layernorms, so the backward passes should not overwrite the gradients! @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options) def fast_rms_layernorm_gemma2_compiled(layernorm, X, gemma = True): old_dtype = X.dtype @@ -207,7 +206,7 @@ def Gemma2DecoderLayer_fast_forward( hidden_states += residual else: residual = hidden_states - hidden_states = fast_rms_layernorm_gemma2_compiled(self.input_layernorm, hidden_states, gemma = True) + hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states, gemma = True) hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, causal_mask=causal_mask, @@ -218,14 +217,14 @@ def Gemma2DecoderLayer_fast_forward( use_cache=use_cache, padding_mask=padding_mask, ) - hidden_states = fast_rms_layernorm_gemma2_compiled(self.post_attention_layernorm, hidden_states, gemma = True) + hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states, gemma = True) hidden_states = residual + hidden_states # Fully Connected residual = hidden_states - hidden_states = fast_rms_layernorm_gemma2_compiled(self. pre_feedforward_layernorm, hidden_states, gemma = True) + hidden_states = fast_rms_layernorm(self. 
pre_feedforward_layernorm, hidden_states, gemma = True) hidden_states = self.mlp(hidden_states) - hidden_states = fast_rms_layernorm_gemma2_compiled(self.post_feedforward_layernorm, hidden_states, gemma = True) + hidden_states = fast_rms_layernorm(self.post_feedforward_layernorm, hidden_states, gemma = True) hidden_states = residual + hidden_states pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 47a57024a2..0256fc1830 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -719,25 +719,33 @@ def LlamaModel_fast_forward( pass # Gemma2 has alternating SWA and global attn + use_static_mask = True + dynamic_SWA_mask = None + dynamic_GA_mask = None if IS_GEMMA2: if HAS_FLASH_ATTENTION_SOFTCAPPING and attention_mask is None: self.SWA_mask = True self.GA_mask = False elif attention_mask is not None: - self.SWA_mask = _prepare_4d_causal_attention_mask_for_sdpa( + + # Fixes https://github.com/unslothai/unsloth/issues/853 + # Unsloth needs a 2D mask, not a [2, 1, n, n] mask! + dynamic_SWA_mask = _prepare_4d_causal_attention_mask_for_sdpa( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length, sliding_window = self.config.sliding_window, - ) - self.GA_mask = _prepare_4d_causal_attention_mask_for_sdpa( + )[0][0] + dynamic_GA_mask = _prepare_4d_causal_attention_mask_for_sdpa( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length, sliding_window = None, - ) + )[0][0] + use_static_mask = False + elif not hasattr(self, "SWA_mask"): if HAS_FLEX_ATTENTION: # Use Flex Attention instead! @@ -772,7 +780,12 @@ def LlamaModel_fast_forward( past_key_value = past_key_values[idx] if past_key_values is not None else None mask = causal_mask - if IS_GEMMA2: mask = self.SWA_mask if (idx % 2 == 0) else self.GA_mask + if IS_GEMMA2: + if (idx % 2 == 0): + mask = self.SWA_mask if use_static_mask else dynamic_SWA_mask + else: + mask = self. 
GA_mask if use_static_mask else dynamic_GA_mask + pass if offloaded_gradient_checkpointing: hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply( @@ -955,14 +968,39 @@ def _CausalLM_fast_forward( ) pass hidden_states = outputs[0] + bsz, q_len, hd = hidden_states.shape lm_head = self.lm_head.weight + logit_softcapping = getattr(self.config, "final_logit_softcapping", 0) + logit_scaling = getattr(self.config, "logit_scale", 0) + if bsz == 1 and q_len == 1: logits = torch.mv(lm_head, hidden_states.ravel().to(lm_head.dtype)) logits = logits.unsqueeze(0).unsqueeze(0) elif num_logits_to_keep != 0: logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(lm_head.dtype)) else: + if HAS_CUT_CROSS_ENTROPY and labels is not None: + n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None) + loss = fused_linear_cross_entropy( + hidden_states = hidden_states, + lm_weight = lm_head, + labels = labels, + num_items_in_batch = n_items, + logit_softcapping = logit_softcapping, + ) + if not return_dict: + output = (None,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=None, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + pass logits = self.lm_head(hidden_states.to(lm_head.dtype)) pass @@ -974,8 +1012,6 @@ def _CausalLM_fast_forward( pass loss = None - logit_softcapping = getattr(self.config, "final_logit_softcapping", 0) - logit_scaling = getattr(self.config, "logit_scale", 0) if labels is not None: shift_logits = logits if not hasattr(self, "extra_ignored_labels"): diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 7a6322d248..232fe6acff 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -20,8 +20,8 @@ from transformers import AutoConfig from transformers import __version__ as transformers_version from peft import PeftConfig, PeftModel -from 
.mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER, MAP_TO_UNSLOTH_16bit -import os +from .loader_utils import get_model_name +import os, contextlib, sys try: from huggingface_hub.utils import get_token except: @@ -63,105 +63,6 @@ def _get_dtype(dtype): pass -def __get_model_name( - model_name, - load_in_4bit = True, - INT_TO_FLOAT_MAPPER = None, - FLOAT_TO_INT_MAPPER = None, - MAP_TO_UNSLOTH_16bit = None, -): - model_name = str(model_name) - lower_model_name = model_name.lower() - - if not SUPPORTS_FOURBIT and lower_model_name in INT_TO_FLOAT_MAPPER: - - model_name = INT_TO_FLOAT_MAPPER[lower_model_name] - logger.warning_once( - f"Unsloth: Your transformers version of {transformers_version} does not support native "\ - f"4bit loading.\nThe minimum required version is 4.37.\n"\ - f'Try `pip install --upgrade "transformers>=4.37"`\n'\ - f"to obtain the latest transformers build, then restart this session.\n"\ - f"For now, we shall load `{model_name}` instead (still 4bit, just slower downloading)." - ) - return model_name - - elif not load_in_4bit and lower_model_name in INT_TO_FLOAT_MAPPER: - - new_model_name = INT_TO_FLOAT_MAPPER[lower_model_name] - # logger.warning_once( - # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ - # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." - # ) - return new_model_name - - elif not load_in_4bit and lower_model_name in MAP_TO_UNSLOTH_16bit: - - new_model_name = MAP_TO_UNSLOTH_16bit[lower_model_name] - return new_model_name - - elif load_in_4bit and SUPPORTS_FOURBIT and lower_model_name in FLOAT_TO_INT_MAPPER: - - new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] - # logger.warning_once( - # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ - # f"We shall load `{new_model_name}` for 4x faster loading." 
- # ) - return new_model_name - pass - - return None -pass - - -def _get_new_mapper(): - try: - import requests - new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" - with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text - new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] - new_mapper = new_mapper\ - .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ - .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER")\ - .replace("MAP_TO_UNSLOTH_16bit", "NEW_MAP_TO_UNSLOTH_16bit") - - exec(new_mapper, globals()) - return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit - except: - return {}, {}, {} - pass -pass - - -def get_model_name(model_name, load_in_4bit = True): - new_model_name = __get_model_name( - model_name = model_name, - load_in_4bit = load_in_4bit, - INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, - FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, - MAP_TO_UNSLOTH_16bit = MAP_TO_UNSLOTH_16bit, - ) - if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): - # Try checking if a new Unsloth version allows it! - NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit = _get_new_mapper() - upgraded_model_name = __get_model_name( - model_name = model_name, - load_in_4bit = load_in_4bit, - INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, - FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, - MAP_TO_UNSLOTH_16bit = NEW_MAP_TO_UNSLOTH_16bit, - ) - if upgraded_model_name is not None: - raise NotImplementedError( - f"Unsloth: {model_name} is not supported in your current Unsloth version! 
Please update Unsloth via:\n\n"\ - 'pip uninstall unsloth -y\n'\ - 'pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' - ) - pass - pass - return new_model_name if new_model_name is not None else model_name -pass - - class FastLanguageModel(FastLlamaModel): @staticmethod def from_pretrained( @@ -333,7 +234,8 @@ def from_pretrained( else: raise NotImplementedError( f"Unsloth: {model_name} not supported yet!\n"\ - "Make an issue to https://github.com/unslothai/unsloth!", + "Maybe you're doing vision finetuning? Please use FastVisionModel instead!\n"\ + "Otherwise, make an issue to https://github.com/unslothai/unsloth!", ) pass @@ -411,4 +313,236 @@ def from_pretrained( pass return model, tokenizer pass -pass \ No newline at end of file +pass + + +from ._utils import ( + patch_compiling_bitsandbytes, + patch_model_and_tokenizer, + prepare_model_for_kbit_training, + patch_unsloth_smart_gradient_checkpointing, + patch_compiled_autograd, + process_vision_info, + unsloth_compile_transformers, +) +from ..kernels import ( + patch_loss_functions, + post_patch_loss_function, +) +from .vision import FastBaseVisionModel + + +class FastVisionModel(FastBaseVisionModel): + @staticmethod + def from_pretrained( + model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", + max_seq_length = None, # [TODO] No effect + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, # [TODO] No effect + fix_tokenizer = True, # [TODO] No effect + trust_remote_code = False, + use_gradient_checkpointing = "unsloth", + resize_model_vocab = None, # [TODO] No effect + revision = None, + *args, **kwargs, + ): + if token is None: token = get_token() + + patch_compiled_autograd() + patch_compiling_bitsandbytes() + if use_gradient_checkpointing == "unsloth": + patch_unsloth_smart_gradient_checkpointing() + + old_model_name = model_name + model_name = get_model_name(model_name, load_in_4bit) + + 
with open(os.devnull, "w") as _devnull, contextlib.redirect_stdout(_devnull): + patch_loss_functions(torch_compile = False) + model_types = unsloth_compile_transformers( + model_name = model_name, + sdpa_dynamic_mask = True, + sdpa_bool_masks = True, + sdpa_gqa_replace = True, + sdpa_dynamic_compile = True, + compile_attention = True, + disable_causal_masks = True, + compile_torch_modules = True, + compile_custom_modules = True, + compile_function_calls = True, + fuse_lm_head = True, + gradient_checkpointing = True, + manual_replacements = True, + epilogue_fusion = True, + max_autotune = False, + shape_padding = True, + cudagraphs = False, + debug = False, + import_from_cache = False, + disable = False, + ) + pass + + # First check if it's a normal model via AutoConfig + from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled + was_disabled = are_progress_bars_disabled() + disable_progress_bars() + + autoconfig_error = None + peft_error = None + try: + model_config = AutoConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + is_model = True + except Exception as error: + autoconfig_error = str(error) + is_model = False + try: + peft_config = PeftConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + is_peft = True + except Exception as error: + peft_error = str(error) + is_peft = False + pass + + # Both config.json and adapter_config.json should not exist! + + # Old transformers versions check + both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 + + # New transformers need to check manually. 
+ if SUPPORTS_LLAMA32: + # Check if folder exists locally + if os.path.isdir(model_name): + exist_adapter_config = os.path.exists(os.path.join(model_name, "adapter_config.json")) + exist_config = os.path.exists(os.path.join(model_name, "config.json")) + both_exist = exist_adapter_config and exist_config + else: + files = HfFileSystem(token = token).glob(os.path.join(model_name, "*.json")) + files = (os.path.split(x)[-1] for x in files) + if sum(x == "adapter_config.json" or x == "config.json" for x in files) >= 2: + both_exist = True + pass + pass + pass + + # Error out if both LoRA and normal model config exists. + if both_exist: + raise RuntimeError( + "Unsloth: Your repo has a LoRA adapter and a base model.\n"\ + "You have 2 files `config.json` and `adapter_config.json`.\n"\ + "We must only allow one config file.\n"\ + "Please separate the LoRA and base models to 2 repos." + ) + + elif not is_model and not is_peft: + error = autoconfig_error or peft_error + # Old transformers version + if "rope_scaling" in error.lower() and not SUPPORTS_LLAMA31: + raise ImportError( + f"Unsloth: Your transformers version of {transformers_version} does not support new RoPE scaling methods.\n"\ + f"This includes Llama 3.1. 
The minimum required version is 4.43.2\n"\ + f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\ + f"to obtain the latest transformers build, then restart this session."\ + ) + raise RuntimeError(autoconfig_error or peft_error) + pass + + # Get base model for PEFT: + if is_peft: + # Check base model again for PEFT + model_name = get_model_name(peft_config.base_model_name_or_path, load_in_4bit) + model_config = AutoConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + pass + + if not was_disabled: enable_progress_bars() + + # Check if this is local model since the tokenizer gets overwritten + if os.path.exists(os.path.join(old_model_name, "tokenizer_config.json")) and \ + os.path.exists(os.path.join(old_model_name, "tokenizer.json")) and \ + os.path.exists(os.path.join(old_model_name, "special_tokens_map.json")): + + tokenizer_name = old_model_name + else: + tokenizer_name = None + pass + + model, tokenizer = FastBaseVisionModel.from_pretrained( + model_name = model_name, + max_seq_length = max_seq_length, + dtype = _get_dtype(dtype), + load_in_4bit = load_in_4bit, + token = token, + device_map = device_map, + trust_remote_code = trust_remote_code, + revision = revision if not is_peft else None, + model_types = model_types, + tokenizer_name = tokenizer_name, + *args, **kwargs, + ) + + if resize_model_vocab is not None: + model.resize_token_embeddings(resize_model_vocab) + pass + + # In case the model supports tagging, add the unsloth tag. + if hasattr(model, "add_model_tags"): + model.add_model_tags(["unsloth",]) + pass + if hasattr(tokenizer, "add_model_tags"): + tokenizer.add_model_tags(["unsloth",]) + pass + + if load_in_4bit: + # Fix up bitsandbytes config + quantization_config = \ + { + # Sometimes torch_dtype is not a string!! 
+ "bnb_4bit_compute_dtype" : model.config.to_dict()["torch_dtype"], + "bnb_4bit_quant_type" : "nf4", + "bnb_4bit_use_double_quant" : True, + "llm_int8_enable_fp32_cpu_offload" : False, + "llm_int8_has_fp16_weight" : False, + "llm_int8_skip_modules" : None, + "llm_int8_threshold" : 6.0, + "load_in_4bit" : True, + "load_in_8bit" : False, + "quant_method" : "bitsandbytes", + } + model.config.update({"quantization_config" : quantization_config}) + pass + + if is_peft: + # From https://github.com/huggingface/peft/issues/184 + # Now add PEFT adapters + model.enable_input_require_grads() + model = PeftModel.from_pretrained( + model, + old_model_name, + token = token, + revision = revision, + is_trainable = True, + trust_remote_code = trust_remote_code, + ) + # Patch it as well! + model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing) + pass + return model, tokenizer + pass +pass diff --git a/unsloth/models/loader_utils.py b/unsloth/models/loader_utils.py new file mode 100644 index 0000000000..b778b7e95b --- /dev/null +++ b/unsloth/models/loader_utils.py @@ -0,0 +1,120 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER, MAP_TO_UNSLOTH_16bit +# https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading! 
+from packaging.version import Version +from transformers import __version__ as transformers_version +transformers_version = Version(transformers_version) +SUPPORTS_FOURBIT = transformers_version >= Version("4.37") + + +def __get_model_name( + model_name, + load_in_4bit = True, + INT_TO_FLOAT_MAPPER = None, + FLOAT_TO_INT_MAPPER = None, + MAP_TO_UNSLOTH_16bit = None, +): + model_name = str(model_name) + lower_model_name = model_name.lower() + + if not SUPPORTS_FOURBIT and lower_model_name in INT_TO_FLOAT_MAPPER: + + model_name = INT_TO_FLOAT_MAPPER[lower_model_name] + print( + f"Unsloth: Your transformers version of {transformers_version} does not support native "\ + f"4bit loading.\nThe minimum required version is 4.37.\n"\ + f'Try `pip install --upgrade "transformers>=4.37"`\n'\ + f"to obtain the latest transformers build, then restart this session.\n"\ + f"For now, we shall load `{model_name}` instead (still 4bit, just slower downloading)." + ) + return model_name + + elif not load_in_4bit and lower_model_name in INT_TO_FLOAT_MAPPER: + + new_model_name = INT_TO_FLOAT_MAPPER[lower_model_name] + # logger.warning_once( + # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ + # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." + # ) + return new_model_name + + elif not load_in_4bit and lower_model_name in MAP_TO_UNSLOTH_16bit: + + new_model_name = MAP_TO_UNSLOTH_16bit[lower_model_name] + return new_model_name + + elif load_in_4bit and SUPPORTS_FOURBIT and lower_model_name in FLOAT_TO_INT_MAPPER: + + new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] + # logger.warning_once( + # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ + # f"We shall load `{new_model_name}` for 4x faster loading." 
+ # ) + return new_model_name + pass + + return None +pass + + +def _get_new_mapper(): + try: + import requests + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] + new_mapper = new_mapper\ + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER")\ + .replace("MAP_TO_UNSLOTH_16bit", "NEW_MAP_TO_UNSLOTH_16bit") + + exec(new_mapper, globals()) + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit + except: + return {}, {}, {} + pass +pass + + +def get_model_name(model_name, load_in_4bit = True): + new_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, + MAP_TO_UNSLOTH_16bit = MAP_TO_UNSLOTH_16bit, + ) + if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): + # Try checking if a new Unsloth version allows it! + NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit = _get_new_mapper() + upgraded_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, + MAP_TO_UNSLOTH_16bit = NEW_MAP_TO_UNSLOTH_16bit, + ) + if upgraded_model_name is not None: + raise NotImplementedError( + f"Unsloth: {model_name} is not supported in your current Unsloth version! 
Please update Unsloth via:\n\n"\ + 'pip uninstall unsloth unsloth_zoo -y\n'\ + 'pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n'\ + 'pip install --upgrade --no-cache-dir "git+https://github.com/unslothai/unsloth-zoo.git"\n'\ + ) + pass + pass + return new_model_name if new_model_name is not None else model_name +pass diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index d4f1278e1d..fc1dc8cdb0 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -409,12 +409,12 @@ "Qwen/Qwen2.5-Coder-32B", ), "unsloth/Qwen2.5-Coder-0.5B-Instruct-bnb-4bit" : ( - "unsloth/Qwen2.5-Coder-Instruct-0.5B", - "Qwen/Qwen2.5-Coder-Instruct-0.5B", + "unsloth/Qwen2.5-Coder-0.5B-Instruct", + "Qwen/Qwen2.5-Coder-0.5B-Instruct", ), "unsloth/Qwen2.5-Coder-1.5B-Instruct-bnb-4bit" : ( - "unsloth/Qwen2.5-Coder-Instruct-1.5B", - "Qwen/Qwen2.5-Coder-Instruct-1.5B", + "unsloth/Qwen2.5-Coder-1.5B-Instruct", + "Qwen/Qwen2.5-Coder-1.5B-Instruct", ), "unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit" : ( "unsloth/Qwen2.5-Coder-3B-Instruct", @@ -452,6 +452,46 @@ "unsloth/Llama-3.1-Nemotron-70B-Instruct", "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", ), + "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit" : ( + "unsloth/Qwen2-VL-2B-Instruct", + "Qwen/Qwen2-VL-2B-Instruct", + ), + "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit" : ( + "unsloth/Qwen2-VL-7B-Instruct", + "Qwen/Qwen2-VL-7B-Instruct", + ), + "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit" : ( + "unsloth/Llama-3.2-11B-Vision-Instruct", + "meta-llama/Llama-3.2-11B-Vision-Instruct", + ), + "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit" : ( + "unsloth/Llama-3.2-90B-Vision-Instruct", + "meta-llama/Llama-3.2-90B-Vision-Instruct", + ), + "unsloth/Llama-3.2-11B-Vision-bnb-4bit" : ( + "unsloth/Llama-3.2-11B-Vision", + "meta-llama/Llama-3.2-11B-Vision", + ), + "unsloth/Llama-3.2-90B-Vision-bnb-4bit" : ( + "unsloth/Llama-3.2-90B-Vision", + "meta-llama/Llama-3.2-90B-Vision", + ), + 
"unsloth/Pixtral-12B-2409-bnb-4bit" : ( + "unsloth/Pixtral-12B-2409", + "mistralai/Pixtral-12B-2409", + ), + "unsloth/Pixtral-12B-2409-Base-bnb-4bit" : ( + "unsloth/Pixtral-12B-Base-2409", + "mistralai/Pixtral-12B-Base-2409", + ), + "unsloth/llava-1.5-7b-hf-bnb-4bit" : ( + "unsloth/llava-1.5-7b-hf", + "llava-hf/llava-1.5-7b-hf", + ), + "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit" : ( + "unsloth/llava-v1.6-mistral-7b-hf", + "llava-hf/llava-v1.6-mistral-7b-hf", + ), } INT_TO_FLOAT_MAPPER = {} diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 0b8c08a371..d083144651 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -1,58 +1,86 @@ +# Unsloth Zoo - Utilities for Unsloth # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. # -# http://www.apache.org/licenses/LICENSE-2.0 +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ +import torch +from transformers import ( + BitsAndBytesConfig, + AutoModelForVision2Seq, + AutoProcessor, +) from .llama import * -from ..kernels import patch_layernorm, unpatch_layernorm -from ..kernels import patch_rms_layernorm, unpatch_rms_layernorm -from ..kernels import patch_llama_for_causal_lm, unpatch_llama_for_causal_lm -from ._utils import patch_gradient_checkpointing - -from transformers import AutoProcessor -try: - from transformers import MllamaForConditionalGeneration -except: - raise ImportError( - "Unsloth: Please update your transformers version to 4.46.0 for Llama 3.2 support!" - ) -pass +from ..kernels import ( + post_patch_loss_function, +) +from ._utils import __version__ +from peft import LoraConfig, TaskType, get_peft_model +from transformers import set_seed as transformers_set_seed +from unsloth_zoo.peft_utils import ( + get_peft_regex, + merge_and_overwrite_lora, +) + +__all__ = [ + "FastBaseVisionModel", +] + +def _wrap_fast_inference(generate, device_type, dtype, model): + # Wraps inference with bfloat16 / float16 + @torch.inference_mode + def _fast_generate(*args, **kwargs): + # For num_logits_to_keep + kwargs["num_logits_to_keep"] = 1 + + # Remove token_type_ids + kwargs.pop("token_type_ids", None) + + # Check pad_token + model_eos_token_id = getattr(model.config, "eos_token_id", None) + if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"): + model_eos_token_id = model_eos_token_id[0] + + kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id) -class FastVisionModel: + try: + kwargs["pixel_values"] = kwargs["pixel_values"].to(model.dtype) + except: + pass - def pre_patch(self): - patch_gradient_checkpointing() - patch_layernorm() - patch_rms_layernorm() - patch_llama_for_causal_lm() + # Autocasted + with torch.autocast(device_type = device_type, dtype = dtype): + output = generate(*args, **kwargs) + pass + return output pass + return _fast_generate +pass - def post_unpatch(self): - 
unpatch_layernorm() - unpatch_rms_layernorm() - unpatch_llama_for_causal_lm() - pass +class FastBaseVisionModel: @staticmethod def from_pretrained( - model_name = "llava-hf/llava-1.5-7b-hf", + model_name = "unsloth/llama-3-8b-bnb-4bit", max_seq_length = None, dtype = None, load_in_4bit = True, token = None, device_map = "sequential", - rope_scaling = None, trust_remote_code = False, + model_types = None, + tokenizer_name = None, **kwargs, ): if trust_remote_code: @@ -67,7 +95,7 @@ def from_pretrained( max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) statistics = \ - f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers = {transformers_version}.\n"\ + f"==((====))== Unsloth {__version__}: Fast {model_types[0].title()} vision patching. Transformers = {transformers_version}.\n"\ f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. 
FA2 = {HAS_FLASH_ATTENTION}]\n"\ @@ -81,6 +109,7 @@ def from_pretrained( pass # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" get_statistics() # For debugging - we use a download counter to see if environments are not breaking @@ -105,160 +134,36 @@ def from_pretrained( ) pass + kwargs.pop("attn_implementation", None); # No need since we auto call it + # Cannot be None, since HF now checks for the config if load_in_4bit: kwargs["quantization_config"] = bnb_config - self.pre_patch() - model = MllamaForConditionalGeneration.from_pretrained( + model = AutoModelForVision2Seq.from_pretrained( model_name, device_map = device_map, torch_dtype = dtype, - # quantization_config = bnb_config, + # quantization_config = bnb_config, token = token, - max_position_embeddings = max_position_embeddings, trust_remote_code = trust_remote_code, - attn_implementation = "sdpa", + # attn_implementation = "sdpa", [TODO] Pixtral for eg fails **kwargs, ) - self.post_unpatch() - # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer # We currently only support NVIDIA GPUs - AMD / Intel is a work in progress! 
post_check = check_nvidia() # Counteract saved tokenizers + tokenizer_name = model_name if tokenizer_name is None else tokenizer_name tokenizer = AutoProcessor.from_pretrained( - model_name, - ) - model = FastVisionModel.post_patch(model) - - # Patch Trainer - from transformers.trainer import Trainer - try: - if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": - inner_training_loop = inspect.getsource(Trainer._inner_training_loop) - Trainer._original_training_loop = inner_training_loop - else: - inner_training_loop = Trainer._original_training_loop - except: - raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!') - pass - - if ((post_check - pre_check) >= 1).sum() > 1: - raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!') - - import transformers.trainer - items_in_trainer = dir(transformers.trainer) - good_items = [] - for item in items_in_trainer: - # TODO: Support Deepspeed - if item.startswith(("deepspeed", "xm", "met", "smp")): continue - if item in inner_training_loop: good_items.append(item) - pass - exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) - - start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] - end = inner_training_loop.find("\n\n", start) - original_debug = inner_training_loop[start:end] - spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] - front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) - - debug_info = """debug_info = \\ - f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ - f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ - f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ - f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = 
{max_steps:,}\\n"\\ - f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning(debug_info) - import subprocess, re, gc, numpy as np - a = np.array([0,]) - try: - a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True) - a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a) - a = np.array([int(x.decode('utf-8'))/1024 for x in a]) - except: - if not torch.cuda.is_available(): - raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!') - if ((a - PRE_CHECK) >= 1).sum() > 1: - raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!') - for _ in range(3): - gc.collect() - torch.cuda.empty_cache()""" - - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace(original_debug, debug_info) - - debug_info = """n_total_devices = total_train_batch_size // \\ - args.gradient_accumulation_steps // self._train_batch_size - if n_total_devices > 1: - logger.warning_once('Unsloth currently does not support multi GPU setups - but we are working on it!') - debug_info =""" - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) - - front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) - inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = tpu_spmd_dataloader(train_dataloader)", - "raise RuntimeError('Unsloth: TPUs are not yet supported!')" + tokenizer_name, + padding_side = "right", + token = token, ) - inner_training_loop = inner_training_loop.replace( - "self.accelerator.free_memory()", - 
"self.accelerator.free_memory()\n" + \ - front_spaces + "if self.is_deepspeed_enabled:"\ - "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, - ) - - check_batches = """train_dataloader = self.get_train_dataloader() - ga = args.gradient_accumulation_steps - bsz = self._train_batch_size - total_batches = bsz * ga * args.world_size - n_total_devices = total_batches // ga // bsz - if n_total_devices > 1: - logger.warning_once('Unsloth currently does not support multi GPU setups - but we are working on it!') - divisor = n_total_devices / 1 - bsz = self._train_batch_size = max(int(bsz / divisor), 1) - if total_batches // ga // bsz > 1: - divisor = n_total_devices / 1 - ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" - check_batches = check_batches.split('\n') - check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = self.get_train_dataloader()", - check_batches, 1, - ) - inner_training_loop = inner_training_loop.replace( - "_inner_training_loop", - "_fast_inner_training_loop", 1, - ) - exec(inner_training_loop, globals()) - Trainer._inner_training_loop = _fast_inner_training_loop - inner_training_loop = inner_training_loop.replace( - "is_torch_tpu_available()", - "False", - ) - if "n_total_devices >" not in inner_training_loop: - raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!') - pass - inner_training_loop = inner_training_loop.replace( - "is_sagemaker_mp_enabled()", - "False", - ) - exec(inner_training_loop, globals()) - Trainer._inner_training_loop = _fast_inner_training_loop - - # Save max_seq_length - model.max_seq_length = max_position_embeddings - internal_model = model - while hasattr(internal_model, "model"): - internal_model.max_seq_length = max_position_embeddings - internal_model = internal_model.model - pass - internal_model.max_seq_length = 
max_position_embeddings + model, tokenizer = patch_tokenizer(model, tokenizer) + model = post_patch_loss_function(model) # Fix up config for transformers uploading PEFT # Not necessary anymore since we require transformers>=4.37! @@ -271,121 +176,105 @@ def from_pretrained( pass # Log Unsloth version for future fastpaths for inference - model.config.update({"unsloth_version" : __version__}) - - # Add save modules - patch_saving_functions(model) - Trainer._inner_training_loop = _fast_inner_training_loop + if hasattr(model, "config"): + model.config.update({"unsloth_version" : __version__}) + pass + patch_saving_functions(model, vision = True) + patch_saving_functions(tokenizer, vision = True) - # Also fix torch_dtype + # Save tokenizer for inference purposes + tokenizer.padding_side = "left" # Force inference internal_model = model while hasattr(internal_model, "model"): - if hasattr(internal_model, "config"): - if internal_model.config.torch_dtype == "float32": - internal_model.config.torch_dtype = torch.float32 - elif internal_model.config.torch_dtype == "bfloat16": - internal_model.config.torch_dtype = torch.bfloat16 - elif internal_model.config.torch_dtype == "float16": - internal_model.config.torch_dtype = torch.float16 - pass - pass + internal_model._saved_temp_tokenizer = tokenizer internal_model = internal_model.model pass - if hasattr(internal_model, "config"): - if internal_model.config.torch_dtype == "float32": - internal_model.config.torch_dtype = torch.float32 - elif internal_model.config.torch_dtype == "bfloat16": - internal_model.config.torch_dtype = torch.bfloat16 - elif internal_model.config.torch_dtype == "float16": - internal_model.config.torch_dtype = torch.float16 - pass - pass + internal_model._saved_temp_tokenizer = tokenizer return model, tokenizer pass - @staticmethod - def post_patch(model): - # Patch model - layers = model.model.layers - lm_head = model.get_output_embeddings().weight - - # Also patch all dtypes - BnB seems to not allocate 
the correct type? - # BnB default dtype seems to be float16! - correct_dtype = lm_head.weight.dtype - - for name, module in model.named_modules(): - if isinstance(module, (Bnb_Linear4bit, Peft_Linear4bit)): - weight = module.weight - quant_state = weight.quant_state - - if type(quant_state) is list: - # BnB seems to have float16 as default! - module.weight.quant_state[2] = correct_dtype # Cast to correct dtype - else: - # https://github.com/TimDettmers/bitsandbytes/pull/763/files - quant_state.dtype = correct_dtype - pass - pass - pass - - # Clear deleted GPU items - for _ in range(3): - gc.collect() - torch.cuda.empty_cache() - return model - pass - - @staticmethod def get_peft_model( model, - r = 16, - target_modules = "all-linear", - lora_alpha = 16, - lora_dropout = 0, - bias = "none", - layers_to_transform = None, - layers_pattern = None, + r = 16, + target_modules = None, + lora_alpha = 16, + lora_dropout = 0, + bias = "none", + finetune_vision_layers = True, + finetune_language_layers = True, + finetune_attention_modules = True, + finetune_mlp_modules = True, + layers_to_transform = None, + layers_pattern = None, use_gradient_checkpointing = True, - random_state = 3407, - max_seq_length = 2048, # not used anymore - use_rslora = False, - modules_to_save = None, - init_lora_weights = True, - loftq_config = {}, - temporary_location = "_unsloth_temporary_saved_buffers", + random_state = 3407, + max_seq_length = 2048, # not used anymore + use_rslora = False, + modules_to_save = None, + init_lora_weights = True, + loftq_config = {}, + temporary_location = "_unsloth_temporary_saved_buffers", **kwargs, ): transformers_set_seed(random_state) - # Get LoRA - arguments = dict( - r = r, - lora_alpha = lora_alpha, - target_modules = target_modules, - lora_dropout = lora_dropout, - bias = bias, - layers_to_transform = layers_to_transform, - init_lora_weights = init_lora_weights, - # loftq_config = loftq_config, - # use_rslora = use_rslora, - modules_to_save = 
modules_to_save, - **kwargs, - ) + if type(r) is not int: + raise TypeError(f"Unsloth: Rank of {str(r)} must be an integer.") + if r <= 0: + raise TypeError(f"Unsloth: Rank of {str(r)} must be larger than 0.") + + if isinstance(model, PeftModelForCausalLM): + raise RuntimeError("Unsloth: You already added LoRA adapters to your model!") + + if target_modules == "all-linear": + finetune_vision_layers = True + finetune_language_layers = True + finetune_attention_modules = True + finetune_mlp_modules = True + pass + if target_modules is None: + target_modules = get_peft_regex( + model, + finetune_vision_layers = finetune_vision_layers, + finetune_language_layers = finetune_language_layers, + finetune_attention_modules = finetune_attention_modules, + finetune_mlp_modules = finetune_mlp_modules, + ) + else: + assert(type(target_modules) in (list, tuple,)) + pass - lora_config = LoraConfig(**arguments) + # Clear deleted GPU items + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + pass - model = _get_peft_model(model, lora_config) + lora_config = LoraConfig( + r = r, + lora_alpha = lora_alpha, + target_modules = target_modules, + lora_dropout = lora_dropout, + bias = bias, + task_type = TaskType.CAUSAL_LM, + ) + model = prepare_model_for_kbit_training( + model, + use_gradient_checkpointing = use_gradient_checkpointing, + ) + model = get_peft_model(model, lora_config) - model = FastVisionModel.patch_peft_model(model, use_gradient_checkpointing) + model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing) # Clear deleted GPU items for _ in range(3): gc.collect() torch.cuda.empty_cache() pass + patch_saving_functions(model, vision = True) return model pass @@ -396,6 +285,11 @@ def patch_peft_model( model, use_gradient_checkpointing = True, ): + if not isinstance(model, PeftModelForCausalLM): + raise TypeError( + "Unsloth: Your model needs to call `.get_peft_model` first!" 
+ ) + pass model = prepare_model_for_kbit_training( model, @@ -403,20 +297,6 @@ def patch_peft_model( use_reentrant = True, ) - # Fix up config for transformers uploading PEFT - for active_adapter in model.peft_config.keys(): - # Not necessary since we requires transformers >= 4.37 - if False: - name = model.peft_config[active_adapter].base_model_name_or_path - if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): - name = name[:len(name) - len("-bnb-4bit")] - model.peft_config[active_adapter].base_model_name_or_path = name - pass - # Add revision to enable future fast inference paths - # [TODO] Bugs out!see https://github.com/unslothai/unsloth/issues/492 - # model.peft_config[active_adapter].revision = f"unsloth" - pass - from transformers.trainer import Trainer if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": raise RuntimeError( @@ -426,24 +306,7 @@ def patch_peft_model( 'Thank you for your understanding and we appreciate it immensely!' ) pass - - logger.warning_once( - f"Unsloth {__version__} patched {len(model.model.model.layers)} layers with "\ - f"{n_qkv} QKV layers, {n_o} O layers and {n_mlp} MLP layers.", - ) - patch_saving_functions(model) - - # Patch cross entropy loss labels - # Fixes https://github.com/unslothai/unsloth/issues/10 - max_seq_length = model.max_seq_length - extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda:0") - model.model.extra_ignored_labels = extra_ignored_labels - internal_model = model - while hasattr(internal_model, "model"): - internal_model.max_seq_length = max_seq_length - internal_model = internal_model.model - pass - internal_model.max_seq_length = max_seq_length + patch_saving_functions(model, vision = True) # Patch tokenizer to pad to the right internal_model = model @@ -468,37 +331,40 @@ def patch_peft_model( @staticmethod def for_inference(model): - # if model.config.model_type == "qwen2": - # FastLlamaModel.for_training(model) - # return - # pass - - internal_model 
= model - internal_model.gradient_checkpointing = False - internal_model.training = False + model.gradient_checkpointing = False + model.training = False - while hasattr(internal_model, "model"): - internal_model = internal_model.model - internal_model.gradient_checkpointing = False - internal_model.training = False - pass - if hasattr(internal_model, "training"): - internal_model.training = False + for name, module in model.named_modules(): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = False + if hasattr(module, "training"): + module.training = False pass - # Also check if lm_head / embeddings are trained - internal_model = model - while not hasattr(internal_model, "lm_head"): - internal_model = internal_model.model - pass - lm_head = internal_model.lm_head.weight - device_type = lm_head.device.type dtype = model.config.torch_dtype - if type(dtype) is str: if dtype == "float16": dtype = torch.float16 elif dtype == "bfloat16": dtype = torch.bfloat16 pass + device_type = model.device.type + + # Wrap model.generate + if model.generate.__name__ != "_fast_generate": + model._unwrapped_old_generate = model.generate + model.generate = _wrap_fast_inference(model.generate, device_type, dtype, model) + pass + + # Patch tokenizer to pad to the left + internal_model = model + while hasattr(internal_model, "model"): + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "left" + pass + internal_model = internal_model.model + pass + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "left" + pass # Also disable training for embeddings for NEFTune if hasattr(model, "get_input_embeddings"): @@ -516,23 +382,32 @@ def for_inference(model): @staticmethod def for_training(model, use_gradient_checkpointing = True): - internal_model = model - internal_model.gradient_checkpointing = use_gradient_checkpointing - internal_model.training = True + 
model.gradient_checkpointing = use_gradient_checkpointing + model.training = True - # Delete all fast inference loras - for param in model.parameters(): - if hasattr(param, "_fast_lora"): - del param._fast_lora + for name, module in model.named_modules(): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = use_gradient_checkpointing + if hasattr(module, "training"): + module.training = True pass + # Also revert model.generate + if hasattr(model, "_unwrapped_old_generate"): + model.generate = model._unwrapped_old_generate + del model._unwrapped_old_generate + pass + + # Patch tokenizer to pad to the right + internal_model = model while hasattr(internal_model, "model"): + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" + pass internal_model = internal_model.model - internal_model.gradient_checkpointing = use_gradient_checkpointing - internal_model.training = True pass - if hasattr(internal_model, "training"): - internal_model.training = True + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" pass # Also re-enable training for embeddings for NEFTune @@ -548,3 +423,5 @@ def for_training(model, use_gradient_checkpointing = True): return model pass pass + + diff --git a/unsloth/save.py b/unsloth/save.py index b4c6b499cf..b503b2b47a 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -2041,8 +2041,153 @@ def unsloth_convert_lora_to_ggml_and_save_locally( print("Unsloth: Done.") print(f"Unsloth: Conversion completed! Output file: {output_file}") print("\nThis GGML making function was made by Maheswar. 
Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!") +pass + + +from unsloth_zoo.peft_utils import merge_and_overwrite_lora +from .models.loader_utils import get_model_name + +@torch.inference_mode +def unsloth_generic_save( + model, + tokenizer, + save_directory : Union[str, os.PathLike] = "unsloth_finetuned_merge", + save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"] + push_to_hub : bool = False, + token : Optional[Union[str, bool]] = None, + is_main_process : bool = True, + state_dict : Optional[dict] = None, + save_function : Callable = torch.save, + max_shard_size : Union[int, str] = "5GB", + safe_serialization : bool = True, + variant : Optional[str] = None, + save_peft_format : bool = True, + + # Push to hub + use_temp_dir : Optional[bool] = None, + commit_message : Optional[str] = "Trained with Unsloth", + private : Optional[bool] = None, + create_pr : bool = False, + revision : str = None, + commit_description : str = "Upload model trained with Unsloth 2x faster", + tags : List[str] = None, + + # Our functions + temporary_location : str = "_unsloth_temporary_saved_buffers", + maximum_memory_usage : float = 0.9, +): + if token is None and push_to_hub: token = get_token() + + merge_and_overwrite_lora( + get_model_name, + create_huggingface_repo, + model, + save_location = save_directory, + push_to_hub = push_to_hub, + token = token, + upload_location = save_directory if push_to_hub else None, + low_disk_space_usage = True, + private = private, + ) + return +pass + + +def unsloth_generic_save_pretrained_merged( + self, + save_directory : Union[str, os.PathLike], + tokenizer = None, + save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"] + push_to_hub : bool = False, + token : Optional[Union[str, bool]] = None, + is_main_process : bool = True, + state_dict : Optional[dict] = None, + save_function : Callable = torch.save, + max_shard_size : Union[int, str] = "5GB", + 
safe_serialization : bool = True, + variant : Optional[str] = None, + save_peft_format : bool = True, + tags : List[str] = None, + temporary_location : str = "_unsloth_temporary_saved_buffers", + maximum_memory_usage : float = 0.75, +): + """ + Same as .push_to_hub(...) except 4bit weights are auto + converted to float16 with as few overhead as possible. -def patch_saving_functions(model): + Choose for `save_method` to be either: + 1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp. + 2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference. + 3. `lora`: Save LoRA adapters with no merging. Useful for HF inference. + """ + if tokenizer is None: + logger.warning_once( + "Unsloth: You're not saving a tokenizer as well?\n"\ + "You can do it separately via `tokenizer.save_pretrained(...)`" + ) + pass + + arguments = dict(locals()) + arguments["model"] = self + del arguments["self"] + unsloth_generic_save(**arguments) + for _ in range(3): + gc.collect() +pass + + +def unsloth_generic_push_to_hub_merged( + self, + repo_id : str, + tokenizer = None, + save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"] + use_temp_dir : Optional[bool] = None, + commit_message : Optional[str] = "Trained with Unsloth", + private : Optional[bool] = None, + token : Union[bool, str, None] = None, + max_shard_size : Union[int, str, None] = "5GB", + create_pr : bool = False, + safe_serialization : bool = True, + revision : str = None, + commit_description : str = "Upload model trained with Unsloth 2x faster", + tags : Optional[List[str]] = None, + temporary_location : str = "_unsloth_temporary_saved_buffers", + maximum_memory_usage : float = 0.75, +): + """ + Same as .push_to_hub(...) except 4bit weights are auto + converted to float16 with as few overhead as possible. + + Choose for `save_method` to be either: + 1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp. + 2. `4bit`: Merge LoRA into int4 weights. 
Useful for DPO / HF inference. + 3. `lora`: Save LoRA adapters with no merging. Useful for HF inference. + """ + if tokenizer is None: + logger.warning_once( + "Unsloth: You're not saving a tokenizer as well?\n"\ + "You can do it separately via `tokenizer.push_to_hub(...)`" + ) + pass + + arguments = dict(locals()) + arguments["model"] = self + arguments["save_directory"] = repo_id + arguments["push_to_hub"] = True + del arguments["self"] + del arguments["repo_id"] + unsloth_generic_save(**arguments) + for _ in range(3): + gc.collect() +pass + + +def not_implemented_save(*args, **kwargs): + raise NotImplementedError("Unsloth: Sorry GGUF is currently not supported for vision models!") +pass + + +def patch_saving_functions(model, vision = False): import inspect import types from typing import Callable, Optional, Union, List @@ -2131,14 +2276,22 @@ def patch_saving_functions(model): pass # Add saving methods to top level model - if hasattr(model, "config"): - # Counteract tokenizers - model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model) - model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model) - model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) - model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) - model.push_to_hub_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model) - model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model) + if not vision: + if hasattr(model, "config"): + # Counteract tokenizers + model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model) + model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model) + model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) + model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) + model.push_to_hub_ggml = 
types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model) + model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model) + pass + else: + # Vision only 1 option + model.push_to_hub_merged = types.MethodType(unsloth_generic_push_to_hub_merged, model) + model.save_pretrained_merged = types.MethodType(unsloth_generic_save_pretrained_merged, model) + model.push_to_hub_gguf = types.MethodType(not_implemented_save, model) + model.save_pretrained_gguf = types.MethodType(not_implemented_save, model) pass return model pass diff --git a/unsloth/trainer.py b/unsloth/trainer.py index 00956ed41b..012be4b0cb 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -20,13 +20,13 @@ import trl import inspect from trl import SFTTrainer -try: - from trl import SFTConfig as TrainingArguments -except: - from transformers import TrainingArguments -pass from . import is_bfloat16_supported -from unsloth_zoo.training_utils import unsloth_train as _unsloth_train +from unsloth_zoo.training_utils import ( + unsloth_train as _unsloth_train, +) +from unsloth_zoo.vision_utils import ( + UnslothVisionDataCollator, +) from packaging.version import Version import dataclasses @@ -35,6 +35,7 @@ "UnslothTrainer", "unsloth_train", "_patch_trl_trainer", + "UnslothVisionDataCollator", ] # Unsloth gradient accumulation fix: @@ -60,7 +61,11 @@ def unsloth_train(trainer, *args, **kwargs): pass pass - +try: + from trl import SFTConfig as TrainingArguments +except: + from transformers import TrainingArguments +pass @dataclass class UnslothTrainingArguments(TrainingArguments): embedding_learning_rate : Optional[float] = field( @@ -134,7 +139,7 @@ def create_optimizer(self): # From `trl>=0.13.0`, they changed how to pass several params to the trainer # We need to patch to make the transition smooth -def create_backwards_compatible_trainer(trainer_class, config_class): +def _backwards_compatible_trainer(trainer_class, config_class): original_init = 
trainer_class.__init__ @wraps(original_init) @@ -167,6 +172,7 @@ def new_init(self, *args, **kwargs): } # Get parameters that exist in Config but not in TrainingArguments + from transformers import TrainingArguments moved_params = \ set(inspect.signature(config_class) .parameters.keys()) - \ set(inspect.signature(TrainingArguments).parameters.keys()) @@ -207,14 +213,13 @@ def _patch_trl_trainer(): import trl.trainer trl_classes = dir(trl.trainer) - - non_convertable_trainer = set(["PPOv2", "AlignProp"]) - trl_trainers = set(x[:-len("Trainer")] for x in trl_classes if x.endswith("Trainer")) - non_convertable_trainer - trl_configs = set(x[:-len("Config")] for x in trl_classes if x.endswith("Config")) - non_convertable_trainer + trl_trainers = set(x[:-len("Trainer")] for x in trl_classes if x.endswith("Trainer")) + trl_configs = set(x[:-len("Config")] for x in trl_classes if x.endswith("Config")) trl_classes = list(trl_trainers & trl_configs) for x in trl_classes: - exec(f"trl.{x}Trainer.__init__ = create_backwards_compatible_trainer(trl.{x}Trainer, trl.{x}Config)", globals()) + try: exec(f"trl.{x}Trainer.__init__ = _backwards_compatible_trainer(trl.{x}Trainer, trl.{x}Config)", globals()) + except: continue pass trl.__UNSLOTH_BACKWARDS_COMPATIBLE__ = True