diff --git a/pyproject.toml b/pyproject.toml
index 7dfca63faa..5b9dc8bb57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,7 +40,7 @@ triton = [
 ]
 huggingface = [
-    "unsloth_zoo>=2025.3.7",
+    "unsloth_zoo>=2025.3.8",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",
@@ -354,7 +354,7 @@ colab-ampere-torch220 = [
     "flash-attn>=2.6.3",
 ]
 colab-new = [
-    "unsloth_zoo>=2025.3.7",
+    "unsloth_zoo>=2025.3.8",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",
diff --git a/unsloth/__init__.py b/unsloth/__init__.py
index 38453f3614..5bbb85d520 100644
--- a/unsloth/__init__.py
+++ b/unsloth/__init__.py
@@ -198,7 +198,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
 # Check for unsloth_zoo
 try:
     unsloth_zoo_version = importlib_version("unsloth_zoo")
-    if Version(unsloth_zoo_version) < Version("2025.3.7"):
+    if Version(unsloth_zoo_version) < Version("2025.3.8"):
         try:
             os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo")
         except:
diff --git a/unsloth/models/__init__.py b/unsloth/models/__init__.py
index a187ee577a..317525c793 100644
--- a/unsloth/models/__init__.py
+++ b/unsloth/models/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from .llama import FastLlamaModel
-from .loader import FastLanguageModel, FastVisionModel
+from .loader import FastLanguageModel, FastVisionModel, FastTextModel, FastModel
 from .mistral import FastMistralModel
 from .qwen2 import FastQwen2Model
 from .granite import FastGraniteModel
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 37c69ef877..03eb21f4eb 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "2025.3.8"
+__version__ = "2025.3.9"
 
 __all__ = [
     "SUPPORTS_BFLOAT16",
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index a490fb8ab4..3504037b66 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -91,7 +91,7 @@ def original_apply_o(self, X):
 pass
 
 from math import sqrt as math_sqrt
-KV_CACHE_INCREMENT = 256 # KV Cache update size
+KV_CACHE_INCREMENT = 512 # KV Cache update size
 torch_nn_functional_softmax = torch.nn.functional.softmax
 # SDPA has GQA internally
 SDPA_HAS_GQA = "enable_gqa" in scaled_dot_product_attention.__doc__
@@ -1656,6 +1656,13 @@ def from_pretrained(
                 "Are you certain you want to do remote code execution?"
             )
         pass
+        if fast_inference:
+            import platform
+            if platform.system().lower() == 'windows':
+                print("Unsloth: vLLM does not work in Windows! Will use Unsloth inference!")
+                fast_inference = False
+        pass
+
         if token is None: token = get_token()
         if model_patcher is None: model_patcher = FastLlamaModel
         SUPPORTS_BFLOAT16 = is_bfloat16_supported()
@@ -1966,12 +1973,17 @@ def from_pretrained(
         for layer in model.model.layers:
             layer.self_attn.rotary_emb = rotary_emb
         pass
-
+
+        # Add for_inference and for_training
+        model.for_training = functools.partial(FastLlamaModel.for_training, model)
+        model.for_inference = functools.partial(FastLlamaModel.for_inference, model)
+
         # Patch generate
         if model.generate.__name__ != "unsloth_fast_generate":
             model._old_generate = model.generate
             unsloth_fast_generate.__doc__ = model._old_generate.__doc__
             model.generate = types.MethodType(unsloth_fast_generate, model)
+        pass
         return model, tokenizer
     pass
@@ -2404,7 +2416,7 @@ def get_peft_model(
         # Add for_inference and for_training
         model.for_training = functools.partial(FastLlamaModel.for_training, model)
         model.for_inference = functools.partial(FastLlamaModel.for_inference, model)
-
+
         # Patch generate
         if model.generate.__name__ != "unsloth_fast_generate":
             model._old_generate = model.generate
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 30128cd134..800c016cc8 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -383,10 +383,13 @@ def from_pretrained(
     patch_loss_functions,
     post_patch_loss_function,
 )
-from .vision import FastBaseVisionModel
-
+from .vision import FastBaseModel
+from transformers import (
+    AutoModelForVision2Seq,
+    AutoModelForCausalLM,
+)
 
-class FastVisionModel(FastBaseVisionModel):
+class FastModel(FastBaseModel):
     @staticmethod
     def from_pretrained(
         model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
@@ -413,7 +416,7 @@ def from_pretrained(
         patch_compiling_bitsandbytes()
         if use_gradient_checkpointing == "unsloth":
             patch_unsloth_smart_gradient_checkpointing(dtype = dtype)
-
+
         old_model_name = model_name
         if not use_exact_model_name:
             model_name = get_model_name(model_name, load_in_4bit)
@@ -427,7 +430,7 @@ def from_pretrained(
         from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled
         was_disabled = are_progress_bars_disabled()
         disable_progress_bars()
-
+
         autoconfig_error = None
         peft_error = None
         try:
@@ -458,7 +461,7 @@ def from_pretrained(
 
         # Old transformers versions check
         both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32
-
+
         # New transformers need to check manually.
         if SUPPORTS_LLAMA32:
             # Check if folder exists locally
@@ -515,9 +518,12 @@ def from_pretrained(
         if not was_disabled: enable_progress_bars()
 
         do_logging = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
-        redirector = sys.stdout if do_logging else open(os.devnull, "w")
+        if do_logging:
+            redirector = contextlib.nullcontext()
+        else:
+            redirector = contextlib.redirect_stdout(open(os.devnull, "w"))
 
-        with contextlib.redirect_stdout(redirector):
+        with redirector:
             patch_loss_functions(torch_compile = False)
             model_types = unsloth_compile_transformers(
                 model_name = model_name,
@@ -547,7 +553,6 @@ def from_pretrained(
                 return_logits = return_logits,
             )
         pass
-        if do_logging: redirector.close()
 
         # Check if this is local model since the tokenizer gets overwritten
         if os.path.exists(os.path.join(old_model_name, "tokenizer_config.json")) and \
@@ -559,7 +564,12 @@ def from_pretrained(
             tokenizer_name = None
         pass
 
-        model, tokenizer = FastBaseVisionModel.from_pretrained(
+        # Check if VLM
+        is_vlm = any(x.endswith("ForConditionalGeneration") for x in model_config.architectures)
+        is_vlm = is_vlm or hasattr(model_config, "vision_config")
+        auto_model = AutoModelForVision2Seq if is_vlm else AutoModelForCausalLM
+
+        model, tokenizer = FastBaseModel.from_pretrained(
             model_name = model_name,
             max_seq_length = max_seq_length,
             dtype = _get_dtype(dtype),
@@ -570,6 +580,7 @@ def from_pretrained(
             revision = revision if not is_peft else None,
             model_types = model_types,
             tokenizer_name = tokenizer_name,
+            auto_model = auto_model,
             *args, **kwargs,
         )
 
@@ -617,8 +628,14 @@ def from_pretrained(
                 trust_remote_code = trust_remote_code,
             )
             # Patch it as well!
-            model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing)
+            model = FastBaseModel.patch_peft_model(model, use_gradient_checkpointing)
         pass
         return model, tokenizer
     pass
 pass
+
+class FastVisionModel(FastModel):
+    pass
+
+class FastTextModel(FastModel):
+    pass
diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py
index da7f449bb4..a2e609f203 100644
--- a/unsloth/models/mapper.py
+++ b/unsloth/models/mapper.py
@@ -611,6 +611,21 @@
         "open-thoughts/OpenThinker-7B",
         "unsloth/OpenThinker-7B-bnb-4bit",
     ),
+    "unsloth/granite-3.2-2b-instruct-unsloth-bnb-4bit" : (
+        "unsloth/granite-3.2-2b-instruct",
+        "ibm-granite/granite-3.2-2b-instruct",
+        "unsloth/granite-3.2-2b-instruct-bnb-4bit",
+    ),
+    "unsloth/granite-3.2-8b-instruct-unsloth-bnb-4bit" : (
+        "unsloth/granite-3.2-8b-instruct",
+        "ibm-granite/granite-3.2-8b-instruct",
+        "unsloth/granite-3.2-8b-instruct-bnb-4bit",
+    ),
+    "unsloth/QwQ-32B-unsloth-bnb-4bit" : (
+        "unsloth/QwQ-32B",
+        "Qwen/QwQ-32B",
+        "unsloth/QwQ-32B-bnb-4bit",
+    ),
 }
 
 INT_TO_FLOAT_MAPPER = {}
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index d13d394669..ff07ef6917 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -17,6 +17,8 @@
     BitsAndBytesConfig,
     AutoModelForVision2Seq,
     AutoProcessor,
+    AutoTokenizer,
+    AutoModelForCausalLM,
 )
 from .llama import *
 from ..kernels import (
@@ -31,48 +33,60 @@
     requires_grad_for_gradient_checkpointing,
 )
 from triton import __version__ as triton_version
+from unsloth_zoo.utils import _get_dtype
+from unsloth_zoo.patching_utils import patch_model_and_tokenizer
+import types
+import functools
 
 __all__ = [
-    "FastBaseVisionModel",
+    "FastBaseModel",
 ]
 
-def _wrap_fast_inference(generate, device_type, dtype, model):
-    # Wraps inference with bfloat16 / float16
-    @torch.inference_mode
-    def _fast_generate(*args, **kwargs):
-        # For num_logits_to_keep
-        # kwargs["num_logits_to_keep"] = 1
-
-        # Remove token_type_ids
-        kwargs.pop("token_type_ids", None)
+def unsloth_base_fast_generate(
+    self,
+    *args,
+    **kwargs,
+):
+    FastBaseModel.for_inference(self)
+    dtype = _get_dtype(self.config.torch_dtype)
 
-        # Check pad_token
-        model_eos_token_id = getattr(model.config, "eos_token_id", None)
-        if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"):
-            model_eos_token_id = model_eos_token_id[0]
+    # Check if VLM
+    is_vlm = any(x.endswith("ForConditionalGeneration") for x in self.config.architectures)
+    is_vlm = is_vlm or hasattr(self.config, "vision_config")
 
-        kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id)
+    # Remove token_type_ids
+    kwargs.pop("token_type_ids", None)
 
-        try:
-            kwargs["pixel_values"] = kwargs["pixel_values"].to(model.dtype)
-        except:
-            pass
+    # VLMs do not allow logits_to_keep
+    if not is_vlm: kwargs["logits_to_keep"] = 1
 
-        # Autocasted
-        with torch.autocast(device_type = device_type, dtype = dtype):
-            output = generate(*args, **kwargs)
-        pass
-        return output
+    # Check pad_token
+    model_eos_token_id = getattr(self.config, "eos_token_id", None)
+    if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"):
+        model_eos_token_id = model_eos_token_id[0]
+
+    kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id)
+
+    # Get pixel values for VLMs
+    try: kwargs["pixel_values"] = kwargs["pixel_values"].to(dtype)
+    except: pass
+
+    # Mixed precision autocast
+    with torch.inference_mode(), torch.autocast(device_type = "cuda", dtype = dtype):
+        output = self._old_generate(*args, **kwargs)
     pass
-    return _fast_generate
+
+    FastBaseModel.for_training(self)
+    return output
 pass
 
-class FastBaseVisionModel:
+class FastBaseModel:
 
     @staticmethod
     def from_pretrained(
-        model_name = "unsloth/llama-3-8b-bnb-4bit",
+        model_name = "unsloth/Llama-3.2-1B-Instruct",
         max_seq_length = None,
         dtype = None,
         load_in_4bit = True,
@@ -81,6 +95,7 @@ def from_pretrained(
         trust_remote_code = False,
         model_types = None,
         tokenizer_name = None,
+        auto_model = AutoModelForVision2Seq,
         **kwargs,
     ):
         if trust_remote_code:
@@ -94,12 +109,16 @@ def from_pretrained(
         gpu_stats = torch.cuda.get_device_properties(0)
         max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
 
+        from importlib.metadata import version as importlib_version
+        try: vllm_version = f" vLLM: {importlib_version('vllm')}."
+        except: vllm_version = ""
+
         statistics = \
-        f"==((====))==  Unsloth {__version__}: Fast {model_types[0].title()} vision patching. Transformers: {transformers_version}.\n"\
-        f"   {chr(92)}{chr(92)}   /|    GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
+        f"==((====))==  Unsloth {__version__}: Fast {model_types[0].title()} patching. Transformers: {transformers_version}.{vllm_version}\n"\
+        f"   {chr(92)}{chr(92)}   /|    {gpu_stats.name}. Num GPUs = {torch.cuda.device_count()}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
         f"O^O/ {chr(92)}_/ {chr(92)}    Torch: {torch.__version__}. CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}. Triton: {triton_version}\n"\
         f"{chr(92)}        /    Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
-        f' "-____-"     Free Apache license: http://github.com/unslothai/unsloth'
+        f' "-____-"     Free license: http://github.com/unslothai/unsloth'
         print(statistics)
 
         # Warn about fast transfers
@@ -136,8 +155,8 @@ def from_pretrained(
 
         # Cannot be None, since HF now checks for the config
         if load_in_4bit: kwargs["quantization_config"] = bnb_config
-
-        model = AutoModelForVision2Seq.from_pretrained(
+
+        model = auto_model.from_pretrained(
             model_name,
             device_map = device_map,
             torch_dtype = dtype,
@@ -152,26 +171,25 @@ def from_pretrained(
 
         # Counteract saved tokenizers
         tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
-        tokenizer = AutoProcessor.from_pretrained(
+        auto_processor = AutoProcessor if auto_model is AutoModelForVision2Seq else AutoTokenizer
+        tokenizer = auto_processor.from_pretrained(
             tokenizer_name,
             padding_side = "right",
             token = token,
         )
         # Add padding side as well
-        tokenizer.tokenizer.padding_side = "right"
+        if hasattr(tokenizer, "tokenizer"):
+            tokenizer.tokenizer.padding_side = "right"
 
         model, tokenizer = patch_tokenizer(model, tokenizer)
         model = post_patch_loss_function(model)
-
-        # Fix up config for transformers uploading PEFT
-        # Not necessary anymore since we require transformers>=4.37!
-        if False:
-            name = model.config._name_or_path
-            if name.startswith("unsloth/") and name.endswith("-bnb-4bit"):
-                name = name[:len(name) - len("-bnb-4bit")]
-                model.config.update({"_name_or_path" : name})
-            pass
-        pass
+        # Fix other stuff like BnB compute data types
+        model, tokenizer = patch_model_and_tokenizer(
+            model,
+            tokenizer,
+            downcast_rope = False,
+            fix_embeddings = False,
+        )
 
         # Log Unsloth version for future fastpaths for inference
         if hasattr(model, "config"):
@@ -187,13 +205,22 @@ def from_pretrained(
 
         # Save tokenizer for inference purposes
         tokenizer.padding_side = "left" # Force inference
         tokenizer.tokenizer.padding_side = "left" # Force inference
-        internal_model = model
-        while hasattr(internal_model, "model"):
-            internal_model._saved_temp_tokenizer = tokenizer
-            internal_model = internal_model.model
+        m = model
+        while hasattr(m, "model"):
+            m._saved_temp_tokenizer = tokenizer
+            # Also set is_loaded_in_8bit to disable incorrect DDP
+            m.is_loaded_in_8bit = True
+            m = m.model
         pass
-        internal_model._saved_temp_tokenizer = tokenizer
-
+        m._saved_temp_tokenizer = tokenizer
+        # Also set is_loaded_in_8bit to disable incorrect DDP
+        m.is_loaded_in_8bit = True
+
+        # Patch generate
+        if model.generate.__name__ != "unsloth_base_fast_generate":
+            model._old_generate = model.generate
+            unsloth_base_fast_generate.__doc__ = model._old_generate.__doc__
+            model.generate = types.MethodType(unsloth_base_fast_generate, model)
         return model, tokenizer
     pass
@@ -272,7 +299,7 @@ def get_peft_model(
         # Enable gradients on modules which are trainable
         requires_grad_for_gradient_checkpointing(model)
 
-        model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing)
+        model = FastBaseModel.patch_peft_model(model, use_gradient_checkpointing)
 
         # Clear deleted GPU items
         for _ in range(3):
@@ -281,6 +308,9 @@ def get_peft_model(
             gc.collect()
             torch.cuda.empty_cache()
         pass
         patch_saving_functions(model, vision = True)
+        # Add for_inference and for_training
+        model.for_training = functools.partial(FastBaseModel.for_training, model)
+        model.for_inference = functools.partial(FastBaseModel.for_inference, model)
         return model
     pass
@@ -314,62 +344,57 @@ def patch_peft_model(
         patch_saving_functions(model, vision = True)
 
         # Patch tokenizer to pad to the right
-        internal_model = model
-        while hasattr(internal_model, "model"):
-            if hasattr(internal_model, "_saved_temp_tokenizer"):
-                internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
+        m = model
+        while hasattr(m, "model"):
+            if hasattr(m, "_saved_temp_tokenizer"):
+                m._saved_temp_tokenizer.tokenizer.padding_side = "right"
             pass
-            internal_model = internal_model.model
+            # Also set is_loaded_in_8bit to disable incorrect DDP
+            m.is_loaded_in_8bit = True
+            m = m.model
         pass
-        if hasattr(internal_model, "_saved_temp_tokenizer"):
-            internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
+        if hasattr(m, "_saved_temp_tokenizer"):
+            m._saved_temp_tokenizer.tokenizer.padding_side = "right"
         pass
+        # Also set is_loaded_in_8bit to disable incorrect DDP
+        m.is_loaded_in_8bit = True
 
         # Clear deleted GPU items
         for _ in range(3):
             gc.collect()
             torch.cuda.empty_cache()
         pass
+        # Add for_inference and for_training
+        model.for_training = functools.partial(FastBaseModel.for_training, model)
+        model.for_inference = functools.partial(FastBaseModel.for_inference, model)
+
+        # Patch generate
+        if model.generate.__name__ != "unsloth_base_fast_generate":
+            model._old_generate = model.generate
+            unsloth_base_fast_generate.__doc__ = model._old_generate.__doc__
+            model.generate = types.MethodType(unsloth_base_fast_generate, model)
         return model
     pass
 
     @staticmethod
     def for_inference(model):
-        model.gradient_checkpointing = False
-        model.training = False
-
-        for name, module in model.named_modules():
-            if hasattr(module, "gradient_checkpointing"):
-                module.gradient_checkpointing = False
-            if hasattr(module, "training"):
-                module.training = False
-        pass
-
-        dtype = model.config.torch_dtype
-        if type(dtype) is str:
-            if dtype == "float16": dtype = torch.float16
-            elif dtype == "bfloat16": dtype = torch.bfloat16
-        pass
-        device_type = model.device.type
-
-        # Wrap model.generate
-        if model.generate.__name__ != "_fast_generate":
-            model._unwrapped_old_generate = model.generate
-            model.generate = _wrap_fast_inference(model.generate, device_type, dtype, model)
-        pass
-
-        # Patch tokenizer to pad to the left
-        internal_model = model
-        while hasattr(internal_model, "model"):
-            if hasattr(internal_model, "_saved_temp_tokenizer"):
-                internal_model._saved_temp_tokenizer.tokenizer.padding_side = "left"
-            pass
-            internal_model = internal_model.model
-        pass
-        if hasattr(internal_model, "_saved_temp_tokenizer"):
-            internal_model._saved_temp_tokenizer.tokenizer.padding_side = "left"
+        if not hasattr(model, "parameters"):
+            raise TypeError("Unsloth: I think you're passing a tokenizer, not the model to for_inference!")
+
+        def _for_inference(m):
+            if hasattr(m, "gradient_checkpointing"): m.gradient_checkpointing = False
+            if hasattr(m, "training"): m.training = False
+            # Pad tokenizer to the left
+            if hasattr(m, "_saved_temp_tokenizer"): m._saved_temp_tokenizer.padding_side = "left"
+            # Set a flag for generation!
+            m._flag_for_generation = True
         pass
+        m = model
+        while hasattr(m, "model"):
+            _for_inference(m)
+            m = m.model
+        _for_inference(m)
 
         # Also disable training for embeddings for NEFTune
         if hasattr(model, "get_input_embeddings"):
@@ -380,40 +405,34 @@ def for_training(model, use_gradient_checkpointing = True):
             embeddings = model.get_output_embeddings()
             if hasattr(embeddings, "training"): embeddings.training = False
         pass
-        return model
     pass
 
     @staticmethod
     def for_training(model, use_gradient_checkpointing = True):
-        model.gradient_checkpointing = use_gradient_checkpointing
-        model.training = True
-
-        for name, module in model.named_modules():
-            if hasattr(module, "gradient_checkpointing"):
-                module.gradient_checkpointing = use_gradient_checkpointing
-            if hasattr(module, "training"):
-                module.training = True
-        pass
+        if not hasattr(model, "parameters"):
+            raise TypeError("Unsloth: I think you're passing a tokenizer, not the model to for_training!")
 
-        # Also revert model.generate
-        if hasattr(model, "_unwrapped_old_generate"):
-            model.generate = model._unwrapped_old_generate
-            del model._unwrapped_old_generate
+        # Delete all fast inference loras
+        for param in model.parameters():
+            if hasattr(param, "_fast_lora"):
+                del param._fast_lora
         pass
 
-        # Patch tokenizer to pad to the right
-        internal_model = model
-        while hasattr(internal_model, "model"):
-            if hasattr(internal_model, "_saved_temp_tokenizer"):
-                internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
-            pass
-            internal_model = internal_model.model
-        pass
-        if hasattr(internal_model, "_saved_temp_tokenizer"):
-            internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
+        def _for_training(m):
+            if hasattr(m, "gradient_checkpointing"): m.gradient_checkpointing = use_gradient_checkpointing
+            if hasattr(m, "training"): m.training = True
+            # Pad tokenizer to the right
+            if hasattr(m, "_saved_temp_tokenizer"): m._saved_temp_tokenizer.padding_side = "right"
+            # Remove the generation flag
+            if hasattr(m, "_flag_for_generation"): del m._flag_for_generation
         pass
+        m = model
+        while hasattr(m, "model"):
+            _for_training(m)
+            m = m.model
+        _for_training(m)
 
         # Also re-enable training for embeddings for NEFTune
         if hasattr(model, "get_input_embeddings"):
@@ -424,7 +443,6 @@ def for_training(model, use_gradient_checkpointing = True):
             embeddings = model.get_output_embeddings()
             if hasattr(embeddings, "training"): embeddings.training = True
         pass
-        return model
     pass
 pass
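
Below is a minimal usage sketch of the unified loader this patch introduces. It is illustrative only and not part of the diff: it assumes a CUDA GPU and Hugging Face Hub access, uses unsloth/Llama-3.2-1B-Instruct (the new FastBaseModel default above) purely as an example checkpoint, and the generation arguments are arbitrary.

    # Sketch: load a text model through the new FastModel entry point.
    # FastModel.from_pretrained inspects the config: architectures ending in
    # "ForConditionalGeneration" (or configs with a vision_config) are routed
    # through AutoModelForVision2Seq, everything else through AutoModelForCausalLM.
    from unsloth import FastModel

    model, tokenizer = FastModel.from_pretrained(
        "unsloth/Llama-3.2-1B-Instruct",  # example checkpoint, not a recommendation
        max_seq_length = 2048,
        load_in_4bit   = True,
    )

    # generate() is patched to unsloth_base_fast_generate, which calls
    # for_inference() before generating and for_training() afterwards,
    # so no manual mode switching is needed here.
    inputs = tokenizer(["The capital of France is"], return_tensors = "pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens = 16)
    print(tokenizer.batch_decode(output))

FastVisionModel and FastTextModel remain as thin subclasses of FastModel, so existing FastVisionModel call sites keep working unchanged.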