From c39f56fce039742693814b7770bde020399251a3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 9 Aug 2025 14:45:43 -0700 Subject: [PATCH 001/272] Fix mamba --- unsloth/models/loader.py | 2 ++ unsloth/models/vision.py | 1 + 2 files changed, 3 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ea746be43d..75561c4775 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -587,6 +587,8 @@ def from_pretrained( if transformers_version < Version("4.53.0"): raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) elif "falcon-h1" in lowered_model_name: + # Falcon must use float32 Triton ie TRITON_F32_DEFAULT = 'ieee' + # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float32;torch.float16;"\ "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16); "\ diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 5524d8f16d..bdf86196d4 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -373,6 +373,7 @@ def from_pretrained( custom_datatype = _custom_datatype # Execute code as well if len(execute_code.strip()) != 0: + print(execute_code) exec(execute_code) else: custom_datatype = None From 4bd35c509f26c4ff3409090175bba7fab4a604a9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 9 Aug 2025 14:50:53 -0700 Subject: [PATCH 002/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 75561c4775..186d302d44 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -591,8 +591,8 @@ def from_pretrained( # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float32;torch.float16;"\ - "if name.endswith(('q_proj', 'k_proj', 'v_proj', 
'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16); "\ - "os.environ['TRITON_F32_DEFAULT'] = 'ieee';" + "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): "\ + "module, os.environ['TRITON_F32_DEFAULT'] = module.to(torch.float16), 'ieee'" elif "gpt-oss" in lowered_model_name: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # CCE fails on Tesla T4 From 1f0a4c32aac3ca721fb50cad39a8dbbf28e4fc1b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 9 Aug 2025 14:51:04 -0700 Subject: [PATCH 003/272] Update vision.py --- unsloth/models/vision.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index bdf86196d4..5524d8f16d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -373,7 +373,6 @@ def from_pretrained( custom_datatype = _custom_datatype # Execute code as well if len(execute_code.strip()) != 0: - print(execute_code) exec(execute_code) else: custom_datatype = None From 3cb97197d56f31c040c8bc17f68bb682aacb1928 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 9 Aug 2025 14:54:35 -0700 Subject: [PATCH 004/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 186d302d44..b8f2432fc0 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -591,8 +591,8 @@ def from_pretrained( # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float32;torch.float16;"\ - "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): "\ - "module, os.environ['TRITON_F32_DEFAULT'] = module.to(torch.float16), 'ieee'" + "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16);"\ + 
"os.environ['TRITON_F32_DEFAULT'] = 'ieee'" elif "gpt-oss" in lowered_model_name: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # CCE fails on Tesla T4 From 1432eac9d0b82ab732e4e4f1f9fbb0fbbb4c63df Mon Sep 17 00:00:00 2001 From: Datta Nimmaturi Date: Wed, 13 Aug 2025 08:16:43 +0530 Subject: [PATCH 005/272] Filter vLLM standby logs (#3131) * filter vLLM standby logs * safeguard standby logger patch * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py --------- Co-authored-by: Daniel Han --- unsloth/models/_utils.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 4426a28266..d904d8674a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -152,6 +152,40 @@ def __init__(self, text): self.text = text def filter(self, x): return not (self.text in x.getMessage()) pass +if os.environ.get('UNSLOTH_ENABLE_LOGGING', '0') != '1': + try: + from vllm.worker.worker import logger as vllm_worker_logger + vllm_worker_logger.addFilter(HideLoggingMessage("Sleep mode freed")) + del vllm_worker_logger + except: + pass + try: + from vllm.v1.worker.gpu_worker import logger as vllm_gpu_worker_logger + vllm_gpu_worker_logger.addFilter(HideLoggingMessage("Sleep mode freed")) + del vllm_gpu_worker_logger + except: + pass + try: + from vllm.executor.executor_base import logger as vllm_executor_logger + vllm_executor_logger.addFilter(HideLoggingMessage("to fall asleep")) + vllm_executor_logger.addFilter(HideLoggingMessage("to wake up")) + del vllm_executor_logger + except: + pass + try: + from vllm.core.block.prefix_caching_block import logger as vllm_prefix_caching_logger + vllm_prefix_caching_logger.addFilter(HideLoggingMessage("reset prefix cache")) + del vllm_prefix_caching_logger + except: + pass + try: + from vllm.v1.core.block_pool import logger as vllm_block_pool_logger + 
vllm_block_pool_logger.addFilter(HideLoggingMessage("reset prefix cache")) + del vllm_block_pool_logger + except: + pass +pass + # The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here. from transformers.training_args import logger as transformers_training_args_logger transformers_training_args_logger.addFilter(HideLoggingMessage("The speedups")) From fd1124ab64c96af40dbdf8294a9e2bdaa55e01cf Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 12 Aug 2025 21:26:39 -0700 Subject: [PATCH 006/272] Update loader.py --- unsloth/models/loader.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index b8f2432fc0..15f3e43aef 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -111,6 +111,14 @@ def from_pretrained( disable_log_stats = True, *args, **kwargs, ): + # Login to allow private models + if token is None: token = get_token() + if token is not None: + try: + from huggingface_hub import login + login(token = token) + except: + pass if load_in_8bit or full_finetuning: return FastModel.from_pretrained( model_name = model_name, @@ -513,6 +521,13 @@ def from_pretrained( *args, **kwargs, ): if token is None: token = get_token() + # Login to allow private models + if token is not None: + try: + from huggingface_hub import login + login(token = token) + except: + pass if whisper_language is not None: assert(type(whisper_language) is str) if whisper_task is not None: assert(type(whisper_task) is str) SUPPORTS_BFLOAT16 = is_bfloat16_supported() From b78189b2d5a127b43a10f5aed1359a1cfe3629c5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 13 Aug 2025 03:27:54 -0700 Subject: [PATCH 007/272] Add scaler --- unsloth/models/_utils.py | 12 ++++++++++++ unsloth/models/rl.py | 14 ++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d904d8674a..3bd3c2c294 100644 --- 
a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -152,6 +152,7 @@ def __init__(self, text): self.text = text def filter(self, x): return not (self.text in x.getMessage()) pass +# Stop vLLM messages if os.environ.get('UNSLOTH_ENABLE_LOGGING', '0') != '1': try: from vllm.worker.worker import logger as vllm_worker_logger @@ -258,6 +259,17 @@ def filter(self, x): return not (self.text in x.getMessage()) except: pass +# You passed `quantization_config` or equivalent parameters +try: + warnings.filterwarnings( + action = "ignore", + message = r".*quantization_config.*", + category = UserWarning, + append = True, + ) +except: + pass + # Errors out on # Some weights of Gemma3nForConditionalGeneration were not initialized from the model checkpoint from transformers.modeling_utils import logger as transformers_logger diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index deb779588c..e751ef5e30 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -421,6 +421,20 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): RLTrainer_post += neftune_check pass + # Add accelerator scaler to model + if "model" in call_args: + neftune_check = \ + "if hasattr(self, 'accelerator'):\n"\ + " scaler = self.accelerator.scaler\n"\ + " current_model = model\n"\ + " while hasattr(current_model, 'model'):\n"\ + " current_model.accelerator_scaler = scaler\n"\ + " current_model = current_model.model\n"\ + " current_model.accelerator_scaler = scaler\n"\ + "pass\n" + RLTrainer_post += neftune_check + pass + # Edit optional metrics other_metrics_processor = "" if trainer_file in RL_METRICS_CHANGES: From cd2e284c97bb60618da78fcf1314f3a3a5885dd8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 13 Aug 2025 05:12:35 -0700 Subject: [PATCH 008/272] Update llama.py --- unsloth/models/llama.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 3c0d5012ae..eafbd5a433 100644 
--- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1197,12 +1197,25 @@ def _CausalLM_fast_forward( if self.config.model_type == "falcon_h1": hidden_states = hidden_states * self.config.lm_head_multiplier - loss = fused_linear_cross_entropy( - hidden_states = hidden_states, - lm_weight = lm_head, - labels = labels, - num_items_in_batch = n_items, - logit_softcapping = logit_softcapping, + # loss = fused_linear_cross_entropy( + # hidden_states = hidden_states, + # lm_weight = lm_head, + # labels = labels, + # num_items_in_batch = n_items, + # logit_softcapping = logit_softcapping, + # ) + loss = unsloth_fused_ce_loss( + trainer = None, + hidden_states = hidden_states, + lm_head_weight = lm_head, + lm_head_bias = None, + labels = labels, + mask = None, + n_items = n_items, + scaling = getattr(self, "accelerator_scaler", None), + target_gb = 1, + torch_compile = True, + logit_softcapping = logit_softcapping, ) if not return_dict: output = (logits,) + outputs[1:] From 5e976a5881296f35c6affae56178d3a2abc1fb50 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 13 Aug 2025 05:18:55 -0700 Subject: [PATCH 009/272] Update _utils.py --- unsloth/models/_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3bd3c2c294..d6eb82f01c 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -58,6 +58,7 @@ "HAS_CUT_CROSS_ENTROPY", "EMPTY_LOGITS", "fused_linear_cross_entropy", + "unsloth_fused_ce_loss", "patch_unsloth_smart_gradient_checkpointing", "unpatch_unsloth_smart_gradient_checkpointing", @@ -109,6 +110,7 @@ HAS_CUT_CROSS_ENTROPY, fused_linear_cross_entropy, _unsloth_get_batch_samples, + unsloth_fused_ce_loss, ) from unsloth_zoo.vision_utils import ( process_vision_info, From f451adff6be85230da2cd50bf068f23726d9b99d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 13 Aug 2025 06:04:40 -0700 Subject: [PATCH 010/272] Versioning --- pyproject.toml | 6 +++--- unsloth/models/_utils.py | 
2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8e18688ddf..e563ba6fc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "unsloth" dynamic = ["version"] description = "2-5X faster LLM finetuning" readme = "README.md" -requires-python = ">=3.9,<3.13" +requires-python = ">=3.9,<=3.13" license = {text = "Apache-2.0"} keywords = ["ai", "llm",] authors = [ @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.8.3", + "unsloth_zoo>=2025.8.4", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", @@ -384,7 +384,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.8.3", + "unsloth_zoo>=2025.8.4", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d6eb82f01c..d1df57ad5c 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.8.4" +__version__ = "2025.8.5" __all__ = [ "SUPPORTS_BFLOAT16", From 3b82c4259cd7506b351bf9b073a3033be22da8aa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 03:31:47 -0700 Subject: [PATCH 011/272] GPT OSS fix --- unsloth/models/loader.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 7ac27158a2..960f9cc23f 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -615,12 +615,18 @@ def from_pretrained( os.environ["UNSLOTH_ENABLE_CCE"] = "0" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB + # Also set down projection compute dtype to be float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "x = 'gate_up_proj_bias'\n"\ - "if hasattr(module, x): setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ + "if hasattr(module, x): "\ + "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ "x = 'down_proj_bias'\n"\ - "if hasattr(module, x): setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n;" + "if hasattr(module, x): "\ + "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ + ""\ + "if 'down_projs' in name and hasattr(module, 'compute_dtype'): module.compute_dtype = torch.float32\n"\ + ";" else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: if check_model_name in lowered_model_name: From 61366efc914563179c460c16e2e8e144fd4cb4d8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 03:50:52 -0700 
Subject: [PATCH 012/272] GPT OSS fix --- unsloth/models/_utils.py | 2 ++ unsloth/models/loader.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d1df57ad5c..ab2694fde1 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -68,6 +68,7 @@ "patch_fast_lora", "validate_loftq_config", "RaiseUninitialized", + "dequantize_module_weight", ] import torch @@ -724,6 +725,7 @@ def prepare_model_for_kbit_training( # Weirdly LoraLayer.update_layer downcasts PEFT layers to float16?? # For mixed precision, we need it to be in float32 not float16. from peft import __version__ as peft_version +from peft.utils.integrations import dequantize_module_weight if Version(peft_version) < Version("0.12.0"): from peft.tuners.lora.layer import LoraLayer try: diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 960f9cc23f..bb102376d4 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -625,7 +625,9 @@ def from_pretrained( "if hasattr(module, x): "\ "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ ""\ - "if 'down_projs' in name and hasattr(module, 'compute_dtype'): module.compute_dtype = torch.float32\n"\ + "if 'down_projs' in name and hasattr(module, 'compute_dtype') and "\ + "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "module.compute_dtype = torch.float32\n"\ ";" else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: From de043d95684df41bf69ec8ea3c29538a9bcab1e4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 04:28:57 -0700 Subject: [PATCH 013/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index bb102376d4..c61aab750d 100644 --- a/unsloth/models/loader.py +++ 
b/unsloth/models/loader.py @@ -627,7 +627,7 @@ def from_pretrained( ""\ "if 'down_projs' in name and hasattr(module, 'compute_dtype') and "\ "torch.amax(dequantize_module_weight(module)) >= 1024:"\ - "module.compute_dtype = torch.float32\n"\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: From c1ef6f1a6270e24b47259856e4b229f44cbe4053 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 04:36:16 -0700 Subject: [PATCH 014/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 5524d8f16d..0f267104f3 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -458,6 +458,7 @@ def from_pretrained( # Edit data-types if custom_datatype is not None: for jj, (name, module) in enumerate(model.named_modules()): + print(custom_datatype) exec(custom_datatype) pass pass From f18cd268bae43f9c531bc78a0ded608339b9b056 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 04:41:27 -0700 Subject: [PATCH 015/272] Update vision.py --- unsloth/models/vision.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 0f267104f3..fcba556e7a 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -356,6 +356,7 @@ def from_pretrained( correct_dtype = None if os.environ.get("UNSLOTH_FORCE_CUSTOM_DTYPE", "") != "": custom_datatype = os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] + print(custom_datatype) assert custom_datatype.count(";") >= 4 checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code = custom_datatype.split(";", 4) @@ -371,6 +372,7 @@ def from_pretrained( bnb_compute_dtype = eval(_bnb_compute_dtype) correct_dtype = bnb_compute_dtype custom_datatype = _custom_datatype + print(custom_datatype) # Execute code as well if len(execute_code.strip()) != 0: exec(execute_code) @@ -458,7 +460,6 @@ 
def from_pretrained( # Edit data-types if custom_datatype is not None: for jj, (name, module) in enumerate(model.named_modules()): - print(custom_datatype) exec(custom_datatype) pass pass From 02152243313ae76b42e4b887d7d5c1c87b0901a6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 04:44:56 -0700 Subject: [PATCH 016/272] Update loader.py --- unsloth/models/loader.py | 9 +++++---- unsloth/models/vision.py | 2 -- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index c61aab750d..d0b7d4dc4c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -615,16 +615,17 @@ def from_pretrained( os.environ["UNSLOTH_ENABLE_CCE"] = "0" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB - # Also set down projection compute dtype to be float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "x = 'gate_up_proj_bias'\n"\ "if hasattr(module, x): "\ "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ "x = 'down_proj_bias'\n"\ - "if hasattr(module, x): "\ - "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ - ""\ + ";" + else: + # Set down projection compute dtype to be float32 for float16 machines + os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ + "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'compute_dtype') and "\ "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index fcba556e7a..5524d8f16d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -356,7 +356,6 @@ def from_pretrained( correct_dtype = None if os.environ.get("UNSLOTH_FORCE_CUSTOM_DTYPE", "") != "": 
custom_datatype = os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] - print(custom_datatype) assert custom_datatype.count(";") >= 4 checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code = custom_datatype.split(";", 4) @@ -372,7 +371,6 @@ def from_pretrained( bnb_compute_dtype = eval(_bnb_compute_dtype) correct_dtype = bnb_compute_dtype custom_datatype = _custom_datatype - print(custom_datatype) # Execute code as well if len(execute_code.strip()) != 0: exec(execute_code) From 5ed4a46e7c37e81e9db29f205ad811b061c330c1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 17:23:46 -0700 Subject: [PATCH 017/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 5524d8f16d..bfd0011f89 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -386,6 +386,7 @@ def from_pretrained( print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to eager!") del kwargs["attn_implementation"] pass + print(supports_sdpa, kwargs) bnb_config = None if full_finetuning and (load_in_4bit or load_in_8bit): From a22255811467e34ddac87e9af9879e141bb35673 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 19:22:16 -0700 Subject: [PATCH 018/272] Update vision.py --- unsloth/models/vision.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index bfd0011f89..5524d8f16d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -386,7 +386,6 @@ def from_pretrained( print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to eager!") del kwargs["attn_implementation"] pass - print(supports_sdpa, kwargs) bnb_config = None if full_finetuning and (load_in_4bit or load_in_8bit): From 6cffb1cb06a7b2b5d14a3d36acc5970f1bd790a5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 04:25:15 -0700 Subject: [PATCH 019/272] Update llama.py --- 
unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ab7f4bfdde..ae03a685eb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -701,8 +701,9 @@ def LlamaModel_fast_forward( # Fix out of bounds tokenization if hasattr(self, "max_seq_length"): if seq_length > self.max_seq_length: + shape = input_ids.shape if input_ids is not None else inputs_embeds.shape logger.warning_once( - f"Unsloth: Input IDs of length {seq_length} > the model's max sequence length of {self.max_seq_length}.\n"\ + f"Unsloth: Input IDs of shape {shape} with length {seq_length} > the model's max sequence length of {self.max_seq_length}.\n"\ "We shall truncate it ourselves. It's imperative if you correct this issue first." ) if input_ids is not None: From 15d33a5f0a3fed1e8fbd89acf25dda33ceefc436 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 04:34:50 -0700 Subject: [PATCH 020/272] Update llama.py --- unsloth/models/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ae03a685eb..badcd51a12 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -698,6 +698,9 @@ def LlamaModel_fast_forward( seq_length_with_past = seq_length + shape = input_ids.shape if input_ids is not None else inputs_embeds.shape + print(shape) + # Fix out of bounds tokenization if hasattr(self, "max_seq_length"): if seq_length > self.max_seq_length: From 95a4dafadb9c1a3b65b4b0c0643741a4b6e144eb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 04:54:45 -0700 Subject: [PATCH 021/272] Update llama.py --- unsloth/models/llama.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index badcd51a12..ae03a685eb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -698,9 +698,6 @@ def LlamaModel_fast_forward( seq_length_with_past = seq_length - shape 
= input_ids.shape if input_ids is not None else inputs_embeds.shape - print(shape) - # Fix out of bounds tokenization if hasattr(self, "max_seq_length"): if seq_length > self.max_seq_length: From 4104bba896a760833061ece7dbbdff7423b5d141 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 04:55:34 -0700 Subject: [PATCH 022/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e563ba6fc5..6f6f225bde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.8.4", + "unsloth_zoo>=2025.8.5", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", @@ -384,7 +384,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.8.4", + "unsloth_zoo>=2025.8.5", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index ab2694fde1..c84fd118e7 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.8.5" +__version__ = "2025.8.6" __all__ = [ "SUPPORTS_BFLOAT16", From 8cc1999edaee313354f76c2c232389ad3bf07f23 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 05:03:06 -0700 Subject: [PATCH 023/272] Update mapper.py --- unsloth/models/mapper.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 829fe29583..e8fc55c2bd 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -941,6 +941,16 @@ "Qwen/Qwen3-4B-Thinking-2507", "unsloth/Qwen3-4B-Thinking-2507-bnb-4bit", ), + "unsloth/gemma-3-270m-it-unsloth-bnb-4bit" : ( + "unsloth/gemma-3-270m-it", + "google/gemma-3-270m-it", + "unsloth/gemma-3-270m-it-bnb-4bit", + ), + "unsloth/gemma-3-270m-unsloth-bnb-4bit" : ( + "unsloth/gemma-3-270m", + "google/gemma-3-270m", + "unsloth/gemma-3-270m-bnb-4bit", + ), } INT_TO_FLOAT_MAPPER = {} From ffda8a743c54fb648e8fef8039dfbd724d2fdce2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 18:39:46 -0700 Subject: [PATCH 024/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index a5de457cef..a629021339 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -451,6 +451,7 @@ def from_pretrained( # attn_implementation = attn_implementation, **kwargs, ) + print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) raise_handler.remove() # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer From cdf2e17aea327a652b034a9a2601fee0ae780fb5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 18:49:30 -0700 Subject: [PATCH 025/272] Update vision.py --- unsloth/models/vision.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index a629021339..fa3bb25e12 100644 --- a/unsloth/models/vision.py +++ 
b/unsloth/models/vision.py @@ -470,6 +470,7 @@ def from_pretrained( if DEVICE_TYPE == "cuda": torch.cuda.empty_cache() elif DEVICE_TYPE == "xpu": torch.xpu.empty_cache() pass + print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) # Counteract saved tokenizers tokenizer_name = model_name if tokenizer_name is None else tokenizer_name @@ -516,6 +517,7 @@ def from_pretrained( ) model, tokenizer = patch_tokenizer(model, tokenizer) model = post_patch_loss_function(model) + print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) # Log Unsloth version for future fastpaths for inference if hasattr(model, "config"): From 941d1aeb8f6fb724ca2ca2bc6793980e0647931c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 18:52:00 -0700 Subject: [PATCH 026/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index fa3bb25e12..4dc9cc4639 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -507,6 +507,7 @@ def from_pretrained( tokenizer.pad_token_id = __tokenizer.pad_token_id pass # Fix other stuff like BnB compute data types + print("do_forced_float32", do_forced_float32) model, tokenizer = patch_model_and_tokenizer( model, tokenizer, From 73fa72cb69866bec70cad78855fef994eb95b916 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 19:13:27 -0700 Subject: [PATCH 027/272] Upcast norms --- unsloth/models/loader.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 59226f0f42..edd909abfe 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -571,8 +571,15 @@ def from_pretrained( elif "qwen2.5" in lowered_model_name and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers 
>= 4.49.0." + LATEST) # Gemma 3 - elif "gemma-3" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): - raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) + elif "gemma-3" in lowered_model_name: + if transformers_version < Version("4.50.0.dev0"): + raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) + # Set norms to float32 since anyways they get upcasted to float32 + os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ + "all;None;None;"\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ + ";" # Cohere elif "c4ai-command-a-03-2025" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0." + NIGHTLY) @@ -582,7 +589,8 @@ def from_pretrained( os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # Sesame fails os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;torch.float32;torch.float16;"\ - "if name.endswith(('_proj', 'fc1', 'fc2', 'codebook', 'head')): module.to(torch.float16);" + "if name.endswith(('_proj', 'fc1', 'fc2', 'codebook', 'head')): module.to(torch.float16)"\ + ";" # Granite 4 elif 'granite-4' in lowered_model_name: # granite-4 rms norms are stored as 16 bit, but we upcast @@ -594,9 +602,12 @@ def from_pretrained( # Gemma 3N elif "gemma-3n" in lowered_model_name: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" + # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float16;torch.float16;"\ - "if name.endswith(('.conv')): module;"\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ + ";"\ "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConvNormAct_forward; patch_Gemma3nConvNormAct_forward()" if transformers_version < Version("4.53.0"): @@ -606,7 +617,8 @@ def from_pretrained( # since Mamba kernels error out on 
using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float32;torch.float16;"\ - "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16);"\ + "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16)"\ + ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" elif "gpt-oss" in lowered_model_name: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" @@ -615,22 +627,31 @@ def from_pretrained( os.environ["UNSLOTH_ENABLE_CCE"] = "0" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB + # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "x = 'gate_up_proj_bias'\n"\ "if hasattr(module, x): "\ "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ + ""\ "x = 'down_proj_bias'\n"\ "if hasattr(module, x): "\ "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ + ""\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" else: # Set down projection compute dtype to be float32 for float16 machines + # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if 'down_projs' in name and hasattr(module, 'compute_dtype') and "\ + "if 'down_projs' in name and "\ "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ + ""\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: From e4bbeef2c9b56635ff20ffbaff865c26a052babc Mon Sep 17 
00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 19:22:19 -0700 Subject: [PATCH 028/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index edd909abfe..86850b0253 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -646,7 +646,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if 'down_projs' in name and "\ + "if 'down_projs' in name and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ From c8d00bebb323700f00742dec14b1319603db7720 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 19:25:03 -0700 Subject: [PATCH 029/272] Update vision.py --- unsloth/models/vision.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 4dc9cc4639..a5de457cef 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -451,7 +451,6 @@ def from_pretrained( # attn_implementation = attn_implementation, **kwargs, ) - print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) raise_handler.remove() # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer @@ -470,7 +469,6 @@ def from_pretrained( if DEVICE_TYPE == "cuda": torch.cuda.empty_cache() elif DEVICE_TYPE == "xpu": torch.xpu.empty_cache() pass - print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) # Counteract saved tokenizers tokenizer_name = model_name if tokenizer_name is None else tokenizer_name @@ -507,7 +505,6 @@ def from_pretrained( tokenizer.pad_token_id = __tokenizer.pad_token_id pass # Fix other stuff like BnB compute data types - print("do_forced_float32", do_forced_float32) model, 
tokenizer = patch_model_and_tokenizer( model, tokenizer, @@ -518,7 +515,6 @@ def from_pretrained( ) model, tokenizer = patch_tokenizer(model, tokenizer) model = post_patch_loss_function(model) - print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) # Log Unsloth version for future fastpaths for inference if hasattr(model, "config"): From 564b6f8cd6f73bd0f064347a0d83ab236783317e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 16 Aug 2025 23:10:15 -0700 Subject: [PATCH 030/272] Upcast layernorms --- unsloth/models/loader.py | 24 +++++++++--------------- unsloth/models/vision.py | 6 ++++++ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 86850b0253..e59aef1fd0 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -575,11 +575,7 @@ def from_pretrained( if transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) # Set norms to float32 since anyways they get upcasted to float32 - os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "all;None;None;"\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ - ";" + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" # Cohere elif "c4ai-command-a-03-2025" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0." 
+ NIGHTLY) @@ -593,25 +589,25 @@ def from_pretrained( ";" # Granite 4 elif 'granite-4' in lowered_model_name: - # granite-4 rms norms are stored as 16 bit, but we upcast - os.environ["UNSLOTH_UPCAST_LAYERNORM"] = "1" + # Granite-4 rms norms are stored as 16 bit, but we upcast + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # Olmo 2 elif "olmo-2" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: OLMo-2 only works on transformers >= 4.50.0." + NIGHTLY) # Gemma 3N elif "gemma-3n" in lowered_model_name: + if transformers_version < Version("4.53.0"): + raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" - # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float16;torch.float16;"\ "if name.endswith('norm'): "\ "module._pre_set_compute_dtype = torch.float32\n"\ ";"\ "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConvNormAct_forward; patch_Gemma3nConvNormAct_forward()" - - if transformers_version < Version("4.53.0"): - raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) + # Set norms to float32 since anyways they get upcasted to float32 + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" elif "falcon-h1" in lowered_model_name: # Falcon must use float32 Triton ie TRITON_F32_DEFAULT = 'ieee' # since Mamba kernels error out on using lower precision @@ -638,8 +634,6 @@ def from_pretrained( "if hasattr(module, x): "\ "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ ""\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ ";" else: # Set down projection compute dtype to be float32 for float16 
machines @@ -650,9 +644,9 @@ def from_pretrained( "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ ";" + # Set norms to float32 since anyways they get upcasted to float32 + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: if check_model_name in lowered_model_name: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index a5de457cef..6790c5cd12 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -455,6 +455,12 @@ def from_pretrained( # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer + # Check float32 norm weights + if os.environ.get("UNSLOTH_HIGH_PRECISION_LAYERNORM", "0") == "1": + for jj, (name, module) in enumerate(model.named_modules()): + if name.endswith("norm") and hasattr(module, "weight"): + module._pre_set_compute_dtype = torch.float32 + pass # Edit data-types if custom_datatype is not None: with torch.no_grad(): From b8a34b4a5eeeddab69320aed0097a801d7d0b1b8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 Aug 2025 16:45:46 -0700 Subject: [PATCH 031/272] Update llama.py --- unsloth/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ae03a685eb..7217c0b593 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -170,6 +170,7 @@ def needs_device_kw(fn) -> bool: if "cache_position" in kwargs: kwargs["position_ids"] = kwargs["cache_position"] + print(attention_mask) return { "input_ids" : input_ids, "attention_mask": attention_mask, **kwargs, } pass From 509fcb5ea138a7f7d29d033399b0fd0d953499e4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 Aug 2025 16:55:02 -0700 Subject: [PATCH 032/272] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/unsloth/models/llama.py b/unsloth/models/llama.py index 7217c0b593..6beb9943e8 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -170,7 +170,6 @@ def needs_device_kw(fn) -> bool: if "cache_position" in kwargs: kwargs["position_ids"] = kwargs["cache_position"] - print(attention_mask) return { "input_ids" : input_ids, "attention_mask": attention_mask, **kwargs, } pass @@ -798,6 +797,7 @@ def LlamaModel_fast_forward( pass # Ignore attention_mask + print(attention_mask) if attention_mask is None: padding_mask = None elif self.training: From 27f1a2efc64f75eade35e5322b2278bbb1b8812a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 Aug 2025 17:38:42 -0700 Subject: [PATCH 033/272] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6beb9943e8..763d69a5b8 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -797,7 +797,7 @@ def LlamaModel_fast_forward( pass # Ignore attention_mask - print(attention_mask) + print(attention_mask, attention_mask.dtype, attention_mask.shape, attention_mask[:, :, 0]) if attention_mask is None: padding_mask = None elif self.training: From 931851abfdd6fea51c72eee6afdc4809fec14bc3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 Aug 2025 17:51:17 -0700 Subject: [PATCH 034/272] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 763d69a5b8..7cb39f9c77 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -797,7 +797,7 @@ def LlamaModel_fast_forward( pass # Ignore attention_mask - print(attention_mask, attention_mask.dtype, attention_mask.shape, attention_mask[:, :, 0]) + print(attention_mask, attention_mask.dtype, attention_mask.shape, attention_mask) if attention_mask is None: padding_mask = None elif self.training: From 
3b9057bf81aedafba9c7d30f7e3eca80486bec07 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 Aug 2025 19:16:35 -0700 Subject: [PATCH 035/272] Update llama.py --- unsloth/models/llama.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 7cb39f9c77..4100afc60e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +global final_attention_mask import torch import gc import math @@ -797,7 +797,10 @@ def LlamaModel_fast_forward( pass # Ignore attention_mask - print(attention_mask, attention_mask.dtype, attention_mask.shape, attention_mask) + if "RAISE_ATTENTION_MASK" in os.environ: + global final_attention_mask + final_attention_mask = attention_mask + raise if attention_mask is None: padding_mask = None elif self.training: From 3dd87bb0ccc3886611f7fe60e24ec97393c47342 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 03:10:07 -0700 Subject: [PATCH 036/272] Update llama.py --- unsloth/models/llama.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 4100afc60e..ae03a685eb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-global final_attention_mask + import torch import gc import math @@ -797,10 +797,6 @@ def LlamaModel_fast_forward( pass # Ignore attention_mask - if "RAISE_ATTENTION_MASK" in os.environ: - global final_attention_mask - final_attention_mask = attention_mask - raise if attention_mask is None: padding_mask = None elif self.training: From b757faf23e7c4cdbc5eee85c39f4841fd9841450 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 05:36:47 -0700 Subject: [PATCH 037/272] Update save.py --- unsloth/save.py | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index e6d09b78fa..ef9c84e925 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1195,6 +1195,41 @@ def save_to_gguf( f"--outfile {final_location} --vocab-type {vocab_type} "\ f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab" else: + # Fix up conversion script is possible + with open(convert_location, "rb") as f: converter_latest = f.read() + # Fix metadata + converter_latest = re.sub( + rb"(self\.metadata \= .+?\(.+?\)"\ + rb"[\n]{1,}([\s]{4,}))", + rb"\1"\ + rb"if hasattr(self.metadata, 'quantized_by'): self.metadata.quantized_by = 'Unsloth'\n"\ + rb"\2if hasattr(self.metadata, 'repo_url'): self.metadata.repo_url = 'https://huggingface.co/unsloth'\n"\ + rb"\2if hasattr(self.metadata, 'tags'): self.metadata.tags = ['unsloth', 'llama.cpp']\n"\ + rb"\2", + converter_latest, + ) + + # Make mistral_common optional for now + # from x import y + converter_latest = re.sub( + rb"(from mistral_common[^\n\(]{1,})[\s]{0,}\n", + rb"try:\n \1\nexcept:\n pass\n", + converter_latest, + ) + # from x import (y, z,) + converter_latest = re.sub( + rb"(from mistral_common[^\n\(]{1,}[\s]{0,}\(.+?\))", + rb"try:\n \1\nexcept:\n pass\n", + converter_latest, + flags = re.MULTILINE | re.DOTALL, + ) + + try: + # Write file + with open(convert_location, "wb") as file: + file.write(converter_latest) + except: + pass 
command = f"python {convert_location} {model_directory} "\ f"--outfile {final_location} "\ f"--outtype {first_conversion}" @@ -1694,7 +1729,7 @@ def push_to_ollama_hub(username: str, model_name: str, tag: str): print(f"\nMODEL PUBLISHED FAILED WITH RETURN CODE {return_code}") else: print("\nMODEL PUBLISHED SUCCESSFULLY") - +pass def push_to_ollama( tokenizer, @@ -1726,9 +1761,7 @@ def push_to_ollama( ) print("Successfully pushed to ollama") - - - +pass def unsloth_save_pretrained_gguf( From 2e86333f332204c613a2e5636b88f0e1ef34487d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 05:42:11 -0700 Subject: [PATCH 038/272] Update rl.py --- unsloth/models/rl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index e751ef5e30..b08d4eda62 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -487,6 +487,8 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): "logging_steps" : 1, "max_seq_length" : None, "num_generations" : 8, + "steps_per_generation" : 1, # Otherwise defaults to ga_steps which is wrong + "generation_batch_size" : None, # Useless. 
If steps_per_generation set, generation_batch_size clashes "top_k" : None, "vllm_mode" : "colocate", "generation_kwargs" : {}, From b01e948b8d351ce1a8ae41de55e8dc7a7648bc32 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 05:44:09 -0700 Subject: [PATCH 039/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6f6f225bde..f8558a83b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.8.5", + "unsloth_zoo>=2025.8.6", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", @@ -384,7 +384,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.8.5", + "unsloth_zoo>=2025.8.6", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", From a751fd789636a36ba1edd75775946a1339689e00 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 06:07:29 -0700 Subject: [PATCH 040/272] Update rl.py --- unsloth/models/rl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index b08d4eda62..52b1e83694 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -487,8 +487,8 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): "logging_steps" : 1, "max_seq_length" : None, "num_generations" : 8, - "steps_per_generation" : 1, # Otherwise defaults to ga_steps which is wrong - "generation_batch_size" : None, # Useless. If steps_per_generation set, generation_batch_size clashes + # "steps_per_generation" : 1, # Otherwise defaults to ga_steps which is wrong + # "generation_batch_size" : None, # Useless. 
If steps_per_generation set, generation_batch_size clashes "top_k" : None, "vllm_mode" : "colocate", "generation_kwargs" : {}, From 3cb6eaf68bda8bb8bad74bd2087c6f1aa366d80e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 06:24:30 -0700 Subject: [PATCH 041/272] Update rl_replacements.py --- unsloth/models/rl_replacements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py index 2555f0df1f..717e6cbf11 100644 --- a/unsloth/models/rl_replacements.py +++ b/unsloth/models/rl_replacements.py @@ -556,7 +556,7 @@ def grpo_trainer_fix_batch_size(RLTrainer_source, RLConfig_source): " per_device_train_batch_size = num_generations\n" return check_batch_size pass -RL_CONFIG_CHANGES["grpo_trainer"].append(grpo_trainer_fix_batch_size) +# RL_CONFIG_CHANGES["grpo_trainer"].append(grpo_trainer_fix_batch_size) # Add other reward function names From de77a26c00cbc93050e103cf5060e54eac72b15c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 21:02:30 -0700 Subject: [PATCH 042/272] Update rl.py --- unsloth/models/rl.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 52b1e83694..4dabdee639 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -133,15 +133,18 @@ class Unsloth{RLConfig_name}({RLConfig_name}): default = -1, metadata = {{'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}}, ) + {max_seq_length_pre} def __init__({RLConfig_arguments}, vllm_sampling_params = None, unsloth_num_chunks = -1, + {max_seq_length_call} **kwargs, ): {RLConfig_extra_args} super().__init__({RLConfig_call_args}{RLConfig_kwargs}) self.vllm_sampling_params = vllm_sampling_params self.unsloth_num_chunks = unsloth_num_chunks + {max_seq_length_post} pass {RLTrainer_extras} @@ -266,6 +269,21 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): extra_args += mixed_precision pass + # Check if max_seq_length is NOT defined (max_length is now default) + if "max_seq_length" not in call_args and "max_length" in call_args: + max_seq_length_pre = \ + """max_seq_length : Optional[int] = field( + default = None, + metadata = {{'help': 'Maximum sequence length to truncate to.'}}, + )""" + max_seq_length_call = "max_seq_length = max_seq_length," + max_seq_length_post = "self.max_seq_length = max_seq_length" + else: + max_seq_length_pre = "" + max_seq_length_call = "" + max_seq_length_post = "" + pass + # Check if per_device_eval_batch_size (default 8) bigger than bsz # Also use FP16 / BF16 evaluation if "args" in call_args: @@ -353,9 +371,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): " max_length = args.max_length\n"\ " else:\n"\ " model_max_length = getattr(model, 'max_seq_length', None)\n"\ - " # print(model_max_length, 'mml1')\n"\ " if model_max_length is None: model_max_length = getattr(model, 'max_length', None)\n"\ - " # print(model_max_length, 'mml2')\n"\ " if model_max_length is not None:\n"\ " args.max_length = model_max_length\n"\ " max_length = args.max_length\n"\ @@ -666,6 +682,10 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): RLTrainer_post = RLTrainer_post, RL_pre = RL_pre, + max_seq_length_pre = max_seq_length_pre, + max_seq_length_call = max_seq_length_call, + max_seq_length_post = max_seq_length_post, + selective_log_softmax_code = selective_log_softmax_code, ) From 
27ca53180d68e80818e8e40f03e85d6abd897401 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 21:08:45 -0700 Subject: [PATCH 043/272] Update rl.py --- unsloth/models/rl.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 4dabdee639..f21bcbe4db 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -269,21 +269,6 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): extra_args += mixed_precision pass - # Check if max_seq_length is NOT defined (max_length is now default) - if "max_seq_length" not in call_args and "max_length" in call_args: - max_seq_length_pre = \ - """max_seq_length : Optional[int] = field( - default = None, - metadata = {{'help': 'Maximum sequence length to truncate to.'}}, - )""" - max_seq_length_call = "max_seq_length = max_seq_length," - max_seq_length_post = "self.max_seq_length = max_seq_length" - else: - max_seq_length_pre = "" - max_seq_length_call = "" - max_seq_length_post = "" - pass - # Check if per_device_eval_batch_size (default 8) bigger than bsz # Also use FP16 / BF16 evaluation if "args" in call_args: @@ -551,6 +536,21 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): extra_args += learning_rate_check pass + # Check if max_seq_length is NOT defined (max_length is now default) + if "max_seq_length" not in call_args and "max_length" in call_args: + max_seq_length_pre = \ + """max_seq_length : Optional[int] = field( + default = None, + metadata = {{'help': 'Maximum sequence length to truncate to.'}}, + )""" + max_seq_length_call = "max_seq_length = max_seq_length," + max_seq_length_post = "self.max_seq_length = max_seq_length" + else: + max_seq_length_pre = "" + max_seq_length_call = "" + max_seq_length_post = "" + pass + # Add output_dir saving if "output_dir" in call_args: # Default checks From 6514c8ee55baf15360f5bf840dcaf6e8cf9eeb0f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 
2025 21:10:05 -0700 Subject: [PATCH 044/272] Update rl.py --- unsloth/models/rl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index f21bcbe4db..afa6b25731 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -541,7 +541,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): max_seq_length_pre = \ """max_seq_length : Optional[int] = field( default = None, - metadata = {{'help': 'Maximum sequence length to truncate to.'}}, + metadata = {'help': 'Maximum sequence length to truncate to.'}, )""" max_seq_length_call = "max_seq_length = max_seq_length," max_seq_length_post = "self.max_seq_length = max_seq_length" From 3e29ae7ca8fa2ef130a3dedce365d5c33a7d63b7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 22:41:37 -0700 Subject: [PATCH 045/272] Update _utils.py --- unsloth/models/_utils.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 749becf098..dd1798f105 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -273,6 +273,38 @@ def filter(self, x): return not (self.text in x.getMessage()) except: pass +# Using a slow image processor as `use_fast` +try: + from transformers.processing_utils import logger as processing_utils_logger + processing_utils_logger.addFilter(HideLoggingMessage("`use_fast`")) + del processing_utils_logger +except: + pass + +# Using a slow image processor as `use_fast` +try: + from transformers.models.auto.image_processing_auto import logger as processing_utils_logger + processing_utils_logger.addFilter(HideLoggingMessage("`use_fast`")) + del processing_utils_logger +except: + pass + +# `use_cache=True` is incompatible with gradient checkpointing +try: + from transformers.trainer import logger as trainer_logger + trainer_logger.addFilter(HideLoggingMessage("`use_cache=True`")) + del trainer_logger +except: + pass + +# `use_cache=True` is 
incompatible with gradient checkpointing +try: + from transformers.utils.generic import logger as trainer_logger + trainer_logger.addFilter(HideLoggingMessage("`use_cache=True`")) + del trainer_logger +except: + pass + # Errors out on # Some weights of Gemma3nForConditionalGeneration were not initialized from the model checkpoint from transformers.modeling_utils import logger as transformers_logger From a42f6247d09a42ce858a4ce6af733463c2eb958b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 Aug 2025 02:33:58 -0700 Subject: [PATCH 046/272] Update __init__.py --- unsloth/__init__.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 5d9ddbd43f..1055dfb3eb 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -12,6 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +try: + # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' + # MUST do this at the start primarily due to tensorflow causing issues + import google.protobuf.message_factory + class MessageFactory: + def CreatePrototype(self, *args, **kwargs): return + def GetMessages(self, *args, **kwargs): return + def GetPrototype(self, *args, **kwargs): return + if not hasattr(google.protobuf.message_factory, "MessageFactory"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + not hasattr(google.protobuf.message_factory, "GetMessageClass"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + hasattr(google.protobuf.message_factory, "GetMessageClass"): + GetMessageClass = 
google.protobuf.message_factory.GetMessageClass + def GetPrototype(self, descriptor): + return GetMessageClass(descriptor) + google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + pass +except: + pass + import warnings, importlib, sys from packaging.version import Version import os, re, subprocess, inspect From 9437f9e269d28070c2ee68abd6dce087b0cb78f4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 Aug 2025 03:14:46 -0700 Subject: [PATCH 047/272] Torch 2.8 --- pyproject.toml | 112 ++++++++++++++++++++++++++++++++++++++- unsloth/_auto_install.py | 6 ++- 2 files changed, 116 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f8558a83b6..0462327beb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -207,6 +207,16 @@ cu126onlytorch260 = [ "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'", "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'", ] +cu118onlytorch270 = [ + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp39-cp39-win_amd64.whl ; python_version=='3.9' and platform_system == 'Windows'", + "xformers @ 
https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp310-cp310-win_amd64.whl ; python_version=='3.10' and platform_system == 'Windows'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'", +] cu126onlytorch270 = [ "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'", "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'", @@ -227,6 +237,30 @@ cu128onlytorch270 = [ "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'", "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'", ] +cu118onlytorch271 = [ + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] +cu126onlytorch271 = [ + "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] +cu128onlytorch271 = [ + "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ 
https://download.pytorch.org/whl/cu128/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] +cu118onlytorch280 = [ + "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] +cu126onlytorch280 = [ + "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] +cu128onlytorch280 = [ + "xformers @ https://download.pytorch.org/whl/cu129/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu129/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] cu118 = [ "unsloth[huggingface]", "bitsandbytes>=0.45.5", @@ -337,6 +371,11 @@ cu126-torch260 = [ "bitsandbytes>=0.45.5", "unsloth[cu126onlytorch260]", ] +cu118-torch270 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch270]", +] cu126-torch270 = [ "unsloth[huggingface]", "bitsandbytes>=0.45.5", @@ -347,6 +386,36 @@ cu128-torch270 = [ "bitsandbytes>=0.45.5", "unsloth[cu128onlytorch270]", ] +cu118-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch271]", +] +cu126-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu126onlytorch271]", +] +cu128-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu128onlytorch271]", +] +cu118-torch280 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch280]", +] +cu126-torch280 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu126onlytorch280]", +] +cu128-torch280 = [ 
+ "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu128onlytorch280]", +] kaggle = [ "unsloth[huggingface]", ] @@ -540,6 +609,12 @@ cu126-ampere-torch260 = [ "unsloth[cu126onlytorch260]", "unsloth[flashattention]", ] +cu118-ampere-torch270 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch270]", + "unsloth[flashattention]", +] cu126-ampere-torch270 = [ "unsloth[huggingface]", "bitsandbytes>=0.45.5", @@ -552,7 +627,42 @@ cu128-ampere-torch270 = [ "unsloth[cu128onlytorch270]", "unsloth[flashattention]", ] - +cu118-ampere-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch271]", + "unsloth[flashattention]", +] +cu126-ampere-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu126onlytorch271]", + "unsloth[flashattention]", +] +cu128-ampere-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu128onlytorch271]", + "unsloth[flashattention]", +] +cu118-ampere-torch280 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch280]", + "unsloth[flashattention]", +] +cu126-ampere-torch280 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu126onlytorch280]", + "unsloth[flashattention]", +] +cu128-ampere-torch280 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu128onlytorch280]", + "unsloth[flashattention]", +] flashattentiontorch260abiFALSEcu12x = [ "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp39-cp39-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.9'", "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.10'", diff --git a/unsloth/_auto_install.py b/unsloth/_auto_install.py index c8559394ed..27b23ed476 
100644 --- a/unsloth/_auto_install.py +++ b/unsloth/_auto_install.py @@ -30,7 +30,11 @@ elif v < V('2.5.1'): x = 'cu{}{}-torch250' elif v <= V('2.5.1'): x = 'cu{}{}-torch251' elif v < V('2.7.0'): x = 'cu{}{}-torch260' -elif v < V('2.8.0'): x = 'cu{}{}-torch270' +elif v < V('2.7.9'): x = 'cu{}{}-torch270' +elif v < V('2.8.0'): x = 'cu{}{}-torch271' +elif v < V('2.8.9'): x = 'cu{}{}-torch280' else: raise RuntimeError(f"Torch = {v} too new!") +if v > V('2.6.9') and cuda not in ("11.8", "12.6", "12.8"): + raise RuntimeError(f"CUDA = {cuda} not supported!") x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "") print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"') \ No newline at end of file From 1dd99a2ebc8cf9b19d97ffffcc47bd27582f60cd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 Aug 2025 03:16:34 -0700 Subject: [PATCH 048/272] Update rl_replacements.py --- unsloth/models/rl_replacements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py index 717e6cbf11..2555f0df1f 100644 --- a/unsloth/models/rl_replacements.py +++ b/unsloth/models/rl_replacements.py @@ -556,7 +556,7 @@ def grpo_trainer_fix_batch_size(RLTrainer_source, RLConfig_source): " per_device_train_batch_size = num_generations\n" return check_batch_size pass -# RL_CONFIG_CHANGES["grpo_trainer"].append(grpo_trainer_fix_batch_size) +RL_CONFIG_CHANGES["grpo_trainer"].append(grpo_trainer_fix_batch_size) # Add other reward function names From 5349cd0fa072105ab6904b5339b814eb7ed47b1e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 00:10:48 -0700 Subject: [PATCH 049/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index fae6ae0770..ce09049050 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ 
-641,7 +641,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "torch.amax(dequantize_module_weight(module)) >= 512:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 5a344c2017830ee4a8ee02e81f0383ffd8b2016f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 00:14:49 -0700 Subject: [PATCH 050/272] UNSLOTH_ENABLE_CCE --- unsloth/__init__.py | 6 ++++++ unsloth/models/loader.py | 3 --- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index a43dc4f70f..c6851546b5 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -104,6 +104,12 @@ def get_device_count(): del os.environ["PYTORCH_CUDA_ALLOC_CONF"] pass +# CCE fails on Torch 2.8 and above +# OutOfResources: out of resource: shared memory, Required: 98304, Hardware limit: 65536. Reducing block sizes or `num_stages` +if (major_torch >= 2 and minor_torch >= 8) or (major_torch > 2): + os.environ["UNSLOTH_ENABLE_CCE"] = "0" +pass + # Fix Xformers performance issues since 0.0.25 import importlib.util from pathlib import Path diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ce09049050..94fd81d16d 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -618,9 +618,6 @@ def from_pretrained( "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" elif "gpt-oss" in lowered_model_name: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" - # CCE fails on Tesla T4 - # OutOfResources: out of resource: shared memory, Required: 98304, Hardware limit: 65536. 
Reducing block sizes or `num_stages` - os.environ["UNSLOTH_ENABLE_CCE"] = "0" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB # Set norms to float32 since anyways they get upcasted to float32 From e56363c9dcd8e7e34619261871ccf798872e0fe3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 00:40:23 -0700 Subject: [PATCH 051/272] Fix --- unsloth/__init__.py | 2 +- unsloth/models/loader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index c6851546b5..2c72092b57 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -93,7 +93,7 @@ def get_device_count(): # We support Pytorch 2 # Fixes https://github.com/unslothai/unsloth/issues/38 -torch_version = str(torch.__version__).split(".") +torch_version = str(re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)).split(".") major_torch, minor_torch = torch_version[0], torch_version[1] major_torch, minor_torch = int(major_torch), int(minor_torch) if (major_torch < 2): diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 94fd81d16d..00e942ea93 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 512:"\ + "torch.amax(dequantize_module_weight(module)) >= 128:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From c79aece5377480352b1b9eb5339d175551434745 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:12:42 -0700 Subject: [PATCH 052/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 00e942ea93..050e077a39 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( 
os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 128:"\ + "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From c4b530cc29c08693ce139f4c8decdfb80aed6370 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:32:44 -0700 Subject: [PATCH 053/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 050e077a39..0ff765bf4c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 0:"\ + "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 0913b585eaa4d81df1ab0d2fae09f7944f5178cb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:47:06 -0700 Subject: [PATCH 054/272] Update __init__.py --- unsloth/__init__.py | 51 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 2c72092b57..3cb3c2e492 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -53,6 +53,32 @@ # Log Unsloth is being used os.environ["UNSLOTH_IS_PRESENT"] = "1" +# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' +# MUST do this at the start primarily due to tensorflow causing issues +try: + import google.protobuf.message_factory + class MessageFactory: + def CreatePrototype(self, *args, **kwargs): return + def GetMessages(self, *args, **kwargs): return + def GetPrototype(self, *args, **kwargs): return + if not 
hasattr(google.protobuf.message_factory, "MessageFactory"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + not hasattr(google.protobuf.message_factory, "GetMessageClass"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + hasattr(google.protobuf.message_factory, "GetMessageClass"): + GetMessageClass = google.protobuf.message_factory.GetMessageClass + def GetPrototype(self, descriptor): + return GetMessageClass(descriptor) + google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + pass +except: + pass + +# Try importing PyTorch and check version try: import torch except ModuleNotFoundError: @@ -246,31 +272,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo`") pass -try: - # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' - # MUST do this at the start primarily due to tensorflow causing issues - import google.protobuf.message_factory - class MessageFactory: - def CreatePrototype(self, *args, **kwargs): return - def GetMessages(self, *args, **kwargs): return - def GetPrototype(self, *args, **kwargs): return - if not hasattr(google.protobuf.message_factory, "MessageFactory"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - not hasattr(google.protobuf.message_factory, "GetMessageClass"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") 
and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - hasattr(google.protobuf.message_factory, "GetMessageClass"): - GetMessageClass = google.protobuf.message_factory.GetMessageClass - def GetPrototype(self, descriptor): - return GetMessageClass(descriptor) - google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype - pass -except: - pass - from .models import * from .models import __version__ from .save import * From 374f703ee909c56536265e1cca71306a873abd46 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:49:57 -0700 Subject: [PATCH 055/272] Update __init__.py --- unsloth/__init__.py | 50 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 3cb3c2e492..0430e5704d 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -53,31 +53,6 @@ # Log Unsloth is being used os.environ["UNSLOTH_IS_PRESENT"] = "1" -# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' -# MUST do this at the start primarily due to tensorflow causing issues -try: - import google.protobuf.message_factory - class MessageFactory: - def CreatePrototype(self, *args, **kwargs): return - def GetMessages(self, *args, **kwargs): return - def GetPrototype(self, *args, **kwargs): return - if not hasattr(google.protobuf.message_factory, "MessageFactory"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - not hasattr(google.protobuf.message_factory, "GetMessageClass"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - hasattr(google.protobuf.message_factory, 
"GetMessageClass"): - GetMessageClass = google.protobuf.message_factory.GetMessageClass - def GetPrototype(self, descriptor): - return GetMessageClass(descriptor) - google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype - pass -except: - pass - # Try importing PyTorch and check version try: import torch @@ -136,6 +111,31 @@ def get_device_count(): os.environ["UNSLOTH_ENABLE_CCE"] = "0" pass +# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' +# MUST do this at the start primarily due to tensorflow causing issues +try: + import google.protobuf.message_factory + class MessageFactory: + def CreatePrototype(self, *args, **kwargs): return + def GetMessages(self, *args, **kwargs): return + def GetPrototype(self, *args, **kwargs): return + if not hasattr(google.protobuf.message_factory, "MessageFactory"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + not hasattr(google.protobuf.message_factory, "GetMessageClass"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + hasattr(google.protobuf.message_factory, "GetMessageClass"): + GetMessageClass = google.protobuf.message_factory.GetMessageClass + def GetPrototype(self, descriptor): + return GetMessageClass(descriptor) + google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + pass +except: + pass + # Fix Xformers performance issues since 0.0.25 import importlib.util from pathlib import Path From c0efbec6918a125859e10fa8c412d42e360548be Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:51:18 -0700 Subject: [PATCH 056/272] Update __init__.py --- unsloth/__init__.py | 50 
++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 0430e5704d..f34645651b 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -111,31 +111,6 @@ def get_device_count(): os.environ["UNSLOTH_ENABLE_CCE"] = "0" pass -# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' -# MUST do this at the start primarily due to tensorflow causing issues -try: - import google.protobuf.message_factory - class MessageFactory: - def CreatePrototype(self, *args, **kwargs): return - def GetMessages(self, *args, **kwargs): return - def GetPrototype(self, *args, **kwargs): return - if not hasattr(google.protobuf.message_factory, "MessageFactory"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - not hasattr(google.protobuf.message_factory, "GetMessageClass"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - hasattr(google.protobuf.message_factory, "GetMessageClass"): - GetMessageClass = google.protobuf.message_factory.GetMessageClass - def GetPrototype(self, descriptor): - return GetMessageClass(descriptor) - google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype - pass -except: - pass - # Fix Xformers performance issues since 0.0.25 import importlib.util from pathlib import Path @@ -272,6 +247,31 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo`") pass +# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' +# MUST do this at the start primarily due to tensorflow causing 
issues +try: + import google.protobuf.message_factory + class MessageFactory: + def CreatePrototype(self, *args, **kwargs): return + def GetMessages(self, *args, **kwargs): return + def GetPrototype(self, *args, **kwargs): return + if not hasattr(google.protobuf.message_factory, "MessageFactory"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + not hasattr(google.protobuf.message_factory, "GetMessageClass"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + hasattr(google.protobuf.message_factory, "GetMessageClass"): + GetMessageClass = google.protobuf.message_factory.GetMessageClass + def GetPrototype(self, descriptor): + return GetMessageClass(descriptor) + google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + pass +except: + pass + from .models import * from .models import __version__ from .save import * From 761a4454a95b3ff9a6bc28c2f4ed5619df9b828f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:53:53 -0700 Subject: [PATCH 057/272] Update __init__.py --- unsloth/__init__.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index f34645651b..95035b91b0 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -226,27 +226,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # TODO: check triton for intel installed properly. 
pass -# Check for unsloth_zoo -try: - unsloth_zoo_version = importlib_version("unsloth_zoo") - if Version(unsloth_zoo_version) < Version("2025.8.1"): - print( - "Unsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\n"\ - "Do this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`" - ) - # if os.environ.get("UNSLOTH_DISABLE_AUTO_UPDATES", "0") == "0": - # try: - # os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo") - # except: - # try: - # os.system("pip install --upgrade --no-cache-dir --no-deps --user unsloth_zoo") - # except: - # raise ImportError("Unsloth: Please update unsloth_zoo via `pip install --upgrade --no-cache-dir --no-deps unsloth_zoo`") - import unsloth_zoo -except: - raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo`") -pass - # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' # MUST do this at the start primarily due to tensorflow causing issues try: @@ -272,6 +251,27 @@ def GetPrototype(self, descriptor): except: pass +# Check for unsloth_zoo +try: + unsloth_zoo_version = importlib_version("unsloth_zoo") + if Version(unsloth_zoo_version) < Version("2025.8.1"): + print( + "Unsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\n"\ + "Do this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`" + ) + # if os.environ.get("UNSLOTH_DISABLE_AUTO_UPDATES", "0") == "0": + # try: + # os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo") + # except: + # try: + # os.system("pip install --upgrade --no-cache-dir --no-deps --user unsloth_zoo") + # except: + # raise ImportError("Unsloth: Please update unsloth_zoo via `pip install --upgrade --no-cache-dir --no-deps unsloth_zoo`") + import unsloth_zoo +except: + raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo`") +pass + from .models import * from .models import 
__version__ from .save import * From 30ea44c17f2b4e60b77240c1cb1ec93610c57861 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:56:58 -0700 Subject: [PATCH 058/272] Import fixes --- unsloth/__init__.py | 30 ++++-------------------------- unsloth/import_fixes.py | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 26 deletions(-) create mode 100644 unsloth/import_fixes.py diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 95035b91b0..fd6bd7d499 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -17,6 +17,10 @@ import os, re, subprocess, inspect import numpy as np +# Fix some issues before importing other packages +from .import_fixes import fix_message_factory_issue +fix_message_factory_issue(); del fix_message_factory_issue; + # Check if modules that need patching are already imported critical_modules = ['trl', 'transformers', 'peft'] already_imported = [mod for mod in critical_modules if mod in sys.modules] @@ -161,7 +165,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported() pass - # For Gradio HF Spaces? # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ: import triton @@ -226,31 +229,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # TODO: check triton for intel installed properly. 
pass -# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' -# MUST do this at the start primarily due to tensorflow causing issues -try: - import google.protobuf.message_factory - class MessageFactory: - def CreatePrototype(self, *args, **kwargs): return - def GetMessages(self, *args, **kwargs): return - def GetPrototype(self, *args, **kwargs): return - if not hasattr(google.protobuf.message_factory, "MessageFactory"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - not hasattr(google.protobuf.message_factory, "GetMessageClass"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - hasattr(google.protobuf.message_factory, "GetMessageClass"): - GetMessageClass = google.protobuf.message_factory.GetMessageClass - def GetPrototype(self, descriptor): - return GetMessageClass(descriptor) - google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype - pass -except: - pass - # Check for unsloth_zoo try: unsloth_zoo_version = importlib_version("unsloth_zoo") diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py new file mode 100644 index 0000000000..d265a09df0 --- /dev/null +++ b/unsloth/import_fixes.py @@ -0,0 +1,40 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +def fix_message_factory_issue(): + # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' + # MUST do this at the start primarily due to tensorflow causing issues + try: + import google.protobuf.message_factory + class MessageFactory: + def CreatePrototype(self, *args, **kwargs): return + def GetMessages(self, *args, **kwargs): return + def GetPrototype(self, *args, **kwargs): return + if not hasattr(google.protobuf.message_factory, "MessageFactory"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + not hasattr(google.protobuf.message_factory, "GetMessageClass"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + hasattr(google.protobuf.message_factory, "GetMessageClass"): + GetMessageClass = google.protobuf.message_factory.GetMessageClass + def GetPrototype(self, descriptor): + return GetMessageClass(descriptor) + google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + pass + except: + pass +pass From c45467cfd91d5d66308f5cbc8a6ab3cc90bec5d5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:00:51 -0700 Subject: [PATCH 059/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/unsloth/models/loader.py b/unsloth/models/loader.py index 0ff765bf4c..72655782f9 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,8 +637,8 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "if hasattr(module, 'weight') and "\ + "torch.amax(dequantize_module_weight(module)) >= 1:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 55e4c78a943a52b9e0b46b29afae0f79e371573c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:15:10 -0700 Subject: [PATCH 060/272] Fix aimv2 issue --- unsloth/__init__.py | 30 +++------------- unsloth/import_fixes.py | 79 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 27 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index fd6bd7d499..335db48775 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -115,35 +115,15 @@ def get_device_count(): os.environ["UNSLOTH_ENABLE_CCE"] = "0" pass -# Fix Xformers performance issues since 0.0.25 +# Fix other issues import importlib.util from pathlib import Path from importlib.metadata import version as importlib_version from packaging.version import Version -try: - xformers_version = importlib_version("xformers") - if Version(xformers_version) < Version("0.0.29"): - xformers_location = importlib.util.find_spec("xformers").origin - xformers_location = os.path.split(xformers_location)[0] - cutlass = Path(xformers_location) / "ops" / "fmha" / "cutlass.py" - - if cutlass.exists(): - with open(cutlass, "r+", encoding = "utf-8") as f: - text = f.read() - # See https://github.com/facebookresearch/xformers/issues/1176#issuecomment-2545829591 - if "num_splits_key=-1," in text: - text = text.replace("num_splits_key=-1,", "num_splits_key=None,") - f.seek(0) - f.write(text) - 
f.truncate() - print("Unsloth: Patching Xformers to fix some performance issues.") - pass - pass - pass - pass -except: - pass -pass +from .import_fixes import fix_xformers_performance_issue +fix_xformers_performance_issue(); del fix_xformers_performance_issue; +from .import_fixes import fix_vllm_aimv2_issue +fix_vllm_aimv2_issue(); del fix_vllm_aimv2_issue; # Torch 2.4 has including_emulation if DEVICE_TYPE == "cuda": diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py index d265a09df0..126aac6365 100644 --- a/unsloth/import_fixes.py +++ b/unsloth/import_fixes.py @@ -12,9 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import importlib.util +from pathlib import Path +from importlib.metadata import version as importlib_version +from packaging.version import Version +UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1" + +# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' +# MUST do this at the start primarily due to tensorflow causing issues def fix_message_factory_issue(): - # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' - # MUST do this at the start primarily due to tensorflow causing issues try: import google.protobuf.message_factory class MessageFactory: @@ -22,11 +29,15 @@ def CreatePrototype(self, *args, **kwargs): return def GetMessages(self, *args, **kwargs): return def GetPrototype(self, *args, **kwargs): return if not hasattr(google.protobuf.message_factory, "MessageFactory"): + if UNSLOTH_ENABLE_LOGGING: + print("Unsloth: Patching protobuf.MessageFactory as it doesn't exist") google.protobuf.message_factory.MessageFactory = MessageFactory elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ not hasattr(google.protobuf.message_factory, "GetMessageClass"): 
google.protobuf.message_factory.MessageFactory = MessageFactory + if UNSLOTH_ENABLE_LOGGING: + print("Unsloth: Patching protobuf.MessageFactory as it doesn't exist") elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ hasattr(google.protobuf.message_factory, "GetMessageClass"): @@ -34,7 +45,71 @@ def GetPrototype(self, *args, **kwargs): return def GetPrototype(self, descriptor): return GetMessageClass(descriptor) google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + if UNSLOTH_ENABLE_LOGGING: + print("Unsloth: Patching protobuf.MessageFactory.GetPrototype") pass except: pass pass + +# Fix Xformers performance issues since 0.0.25 +def fix_xformers_performance_issue(): + xformers_version = importlib_version("xformers") + if Version(xformers_version) < Version("0.0.29"): + xformers_location = importlib.util.find_spec("xformers").origin + xformers_location = os.path.split(xformers_location)[0] + cutlass = Path(xformers_location) / "ops" / "fmha" / "cutlass.py" + try: + if cutlass.exists(): + with open(cutlass, "r+", encoding = "utf-8") as f: + text = f.read() + # See https://github.com/facebookresearch/xformers/issues/1176#issuecomment-2545829591 + if "num_splits_key=-1," in text: + text = text.replace( + "num_splits_key=-1,", + "num_splits_key=None,", + ) + f.seek(0) + f.write(text) + f.truncate() + if UNSLOTH_ENABLE_LOGGING: + print("Unsloth: Patching Xformers to fix some performance issues.") + except: + pass +pass + +# ValueError: 'aimv2' is already used by a Transformers config, pick another name. 
+def fix_vllm_aimv2_issue(): + vllm_version = importlib_version("vllm") + if Version(vllm_version) < Version("0.10.1"): + vllm_version = importlib.util.find_spec("xformers").origin + vllm_version = os.path.split(vllm_version)[0] + ovis_config = Path(vllm_version) / "transformers_utils" / "configs" / "ovis.py" + try: + if ovis_config.exists(): + with open(ovis_config, "r+", encoding = "utf-8") as f: + text = f.read() + # See https://github.com/vllm-project/vllm-ascend/issues/2046 + if 'AutoConfig.register("aimv2", AIMv2Config)' in text: + text = text.replace( + 'AutoConfig.register("aimv2", AIMv2Config)', + '', + ) + text = text.replace( + '''backbone_config.pop('model_type') + backbone_config = AutoConfig.for_model(model_type, + **backbone_config)''', + '''if model_type != "aimv2": + backbone_config.pop('model_type') + backbone_config = AutoConfig.for_model(model_type, **backbone_config) + else: + backbone_config = AIMv2Config(**backbone_config)''' + ) + f.seek(0) + f.write(text) + f.truncate() + if UNSLOTH_ENABLE_LOGGING: + print("Unsloth: Patching vLLM to fix `'aimv2' is already used by a Transformers config, pick another name.`") + except: + pass +pass From a160e42ad8250f40b25e72e2a1b2e2d550986a65 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:20:31 -0700 Subject: [PATCH 061/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 72655782f9..0ff765bf4c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,8 +637,8 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1:"\ + "if 'down_projs' in name and hasattr(module, 'weight') and "\ + "torch.amax(dequantize_module_weight(module)) >= 1024:"\ 
"module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 675c4effe78a3ef5bb3f21f6892f3edc54e1e935 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:23:21 -0700 Subject: [PATCH 062/272] Update import_fixes.py --- unsloth/import_fixes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py index 126aac6365..1a4172e01f 100644 --- a/unsloth/import_fixes.py +++ b/unsloth/import_fixes.py @@ -54,6 +54,7 @@ def GetPrototype(self, descriptor): # Fix Xformers performance issues since 0.0.25 def fix_xformers_performance_issue(): + if importlib.util.find_spec("xformers") is None: return xformers_version = importlib_version("xformers") if Version(xformers_version) < Version("0.0.29"): xformers_location = importlib.util.find_spec("xformers").origin @@ -80,6 +81,7 @@ def fix_xformers_performance_issue(): # ValueError: 'aimv2' is already used by a Transformers config, pick another name. def fix_vllm_aimv2_issue(): + if importlib.util.find_spec("vllm") is None: return vllm_version = importlib_version("vllm") if Version(vllm_version) < Version("0.10.1"): vllm_version = importlib.util.find_spec("xformers").origin From a99d6b273c59f0908385559ba2d8b441751b6249 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:26:23 -0700 Subject: [PATCH 063/272] Update import_fixes.py --- unsloth/import_fixes.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py index 1a4172e01f..a07f9970f8 100644 --- a/unsloth/import_fixes.py +++ b/unsloth/import_fixes.py @@ -75,8 +75,9 @@ def fix_xformers_performance_issue(): f.truncate() if UNSLOTH_ENABLE_LOGGING: print("Unsloth: Patching Xformers to fix some performance issues.") - except: - pass + except Exception as e: + if UNSLOTH_ENABLE_LOGGING: + print(f"Unsloth: Failed patching Xformers with error = {str(e)}") pass # ValueError: 'aimv2' is already used by a Transformers config, pick another 
name. @@ -84,7 +85,7 @@ def fix_vllm_aimv2_issue(): if importlib.util.find_spec("vllm") is None: return vllm_version = importlib_version("vllm") if Version(vllm_version) < Version("0.10.1"): - vllm_version = importlib.util.find_spec("xformers").origin + vllm_version = importlib.util.find_spec("vllm").origin vllm_version = os.path.split(vllm_version)[0] ovis_config = Path(vllm_version) / "transformers_utils" / "configs" / "ovis.py" try: @@ -112,6 +113,7 @@ def fix_vllm_aimv2_issue(): f.truncate() if UNSLOTH_ENABLE_LOGGING: print("Unsloth: Patching vLLM to fix `'aimv2' is already used by a Transformers config, pick another name.`") - except: - pass + except Exception as e: + if UNSLOTH_ENABLE_LOGGING: + print(f"Unsloth: Failed patching vLLM with error = {str(e)}") pass From 7e8262303ef06bc39367a17acf0e783abb37c1b4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:38:39 -0700 Subject: [PATCH 064/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0ff765bf4c..050e077a39 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 0e678d6fe9ef0aeced0380184bfb9e7c9b1a1778 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 03:38:26 -0700 Subject: [PATCH 065/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 050e077a39..1b110ca513 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to 
float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if 'down_projs' in name and hasattr(module, 'weight') and "\ + "if ('down_projs' in name or 'gate_up_projs' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ From 9b82317a699779d8b96e986fe8ef7a3f16494247 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 04:09:24 -0700 Subject: [PATCH 066/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 1b110ca513..0da6b83d12 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,8 +637,8 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if ('down_projs' in name or 'gate_up_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 0:"\ + "if ('down_projs' in name) and hasattr(module, 'weight') and "\ + "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 8a76fd32bdf05d3e63dd6df309b52d861e11ef3f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 04:39:30 -0700 Subject: [PATCH 067/272] Upgrade --- pyproject.toml | 4 ++-- unsloth/__init__.py | 2 +- unsloth/models/_utils.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c4c3ebe6f5..83b75b0a00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.8.7", + "unsloth_zoo>=2025.8.8", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ 
"flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.8.7", + "unsloth_zoo>=2025.8.8", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 335db48775..a6ea8f4c9f 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -212,7 +212,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # Check for unsloth_zoo try: unsloth_zoo_version = importlib_version("unsloth_zoo") - if Version(unsloth_zoo_version) < Version("2025.8.1"): + if Version(unsloth_zoo_version) < Version("2025.8.8"): print( "Unsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\n"\ "Do this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`" diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 85f1a9a960..fde776a5e6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.8.8" +__version__ = "2025.8.9" __all__ = [ "SUPPORTS_BFLOAT16", From 94bcb28818558f7de378ef4356b5ac6651e545fa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 04:40:24 -0700 Subject: [PATCH 068/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0da6b83d12..54d2fa2ce6 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 7d7a1156843603b2b283f77e283801feffbb0ac6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 05:17:37 -0700 Subject: [PATCH 069/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 54d2fa2ce6..878a7c4a4c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if ('down_projs' in name) and hasattr(module, 'weight') and "\ + "if hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ From 031f5e12487786462fc2f0306ff6792697b2dec7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 07:20:46 -0700 Subject: [PATCH 070/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 878a7c4a4c..3af8200ebb 100644 --- 
a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -636,8 +636,8 @@ def from_pretrained( # Set down projection compute dtype to be float32 for float16 machines # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "all;None;None;"\ - "if hasattr(module, 'weight') and "\ + "torch.float16;torch.bfloat16;torch.bfloat16;"\ + "if ('down_projs' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ From 98bee64be03b6988613e2e3b1dbc5013bff3242b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 07:34:42 -0700 Subject: [PATCH 071/272] Update loader.py --- unsloth/models/loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3af8200ebb..3aed8654f8 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -636,11 +636,13 @@ def from_pretrained( # Set down projection compute dtype to be float32 for float16 machines # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "torch.float16;torch.bfloat16;torch.bfloat16;"\ + "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ + "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 2ba900880d41c43e5322837d046f00425f3a249c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 17:24:53 -0700 Subject: [PATCH 072/272] Update vision.py --- unsloth/models/vision.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unsloth/models/vision.py 
b/unsloth/models/vision.py index 6790c5cd12..2d3e0a2002 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -245,6 +245,7 @@ def unsloth_base_fast_generate( return output pass +global partial_model class FastBaseModel: @@ -454,6 +455,9 @@ def from_pretrained( raise_handler.remove() # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer + global partial_model + partial_model = model + raise # Check float32 norm weights if os.environ.get("UNSLOTH_HIGH_PRECISION_LAYERNORM", "0") == "1": From ea435e6d06712d59ebe00f8e23c86edacc96173a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 18:17:55 -0700 Subject: [PATCH 073/272] Update vision.py --- unsloth/models/vision.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 2d3e0a2002..a61337b791 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -455,9 +455,6 @@ def from_pretrained( raise_handler.remove() # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer - global partial_model - partial_model = model - raise # Check float32 norm weights if os.environ.get("UNSLOTH_HIGH_PRECISION_LAYERNORM", "0") == "1": @@ -525,6 +522,9 @@ def from_pretrained( ) model, tokenizer = patch_tokenizer(model, tokenizer) model = post_patch_loss_function(model) + global partial_model + partial_model = model + raise # Log Unsloth version for future fastpaths for inference if hasattr(model, "config"): From 5bebfa9f37b933a3b000a5aa3f22448ac8fde7c0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:13:42 -0700 Subject: [PATCH 074/272] custom_datatype --- unsloth/models/loader.py | 2 +- unsloth/models/vision.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3aed8654f8..9ab990133c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -640,7 +640,7 @@ def 
from_pretrained( "if ('down_projs' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ - ""\ + "\n"\ "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ "module._pre_set_compute_dtype = torch.float32\n"\ ";" diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index a61337b791..c57fd80ef5 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -463,6 +463,7 @@ def from_pretrained( module._pre_set_compute_dtype = torch.float32 pass # Edit data-types + print("custom_datatype", custom_datatype) if custom_datatype is not None: with torch.no_grad(): for jj, (name, module) in enumerate(model.named_modules()): From 356789a65805931f09ffca007227d203f19d1ebc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:18:03 -0700 Subject: [PATCH 075/272] recheck --- unsloth/models/loader.py | 1 + unsloth/models/vision.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 9ab990133c..3de0943917 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -646,6 +646,7 @@ def from_pretrained( ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" + print(os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"]) else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: if check_model_name in lowered_model_name: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index c57fd80ef5..419d760f7a 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -359,7 +359,7 @@ def from_pretrained( custom_datatype = os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] assert custom_datatype.count(";") >= 4 checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code = custom_datatype.split(";", 4) - + print(checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code) # Allow custom 
dtypes on all runs allow_all_runs = (checker == "all") # Allow only on float16 datatypes From d0f97a9a0f295fbe08f3c6b4401b34bcea125ac1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:21:21 -0700 Subject: [PATCH 076/272] Float16 --- unsloth/models/loader.py | 5 ++--- unsloth/models/vision.py | 5 ++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3de0943917..a7d3da17bd 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -601,7 +601,7 @@ def from_pretrained( raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "float16;torch.float16;torch.float16;"\ + "torch.float16;torch.float16;torch.float16;"\ "if name.endswith('norm'): "\ "module._pre_set_compute_dtype = torch.float32\n"\ ";"\ @@ -612,7 +612,7 @@ def from_pretrained( # Falcon must use float32 Triton ie TRITON_F32_DEFAULT = 'ieee' # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "float16;torch.float32;torch.float16;"\ + "torch.float16;torch.float32;torch.float16;"\ "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16)"\ ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" @@ -646,7 +646,6 @@ def from_pretrained( ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" - print(os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"]) else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: if check_model_name in lowered_model_name: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 419d760f7a..12ec00c3bd 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -363,7 +363,10 @@ def from_pretrained( # Allow custom dtypes on all runs allow_all_runs = (checker == 
"all") # Allow only on float16 datatypes - allow_float16_runs = (checker == "float16" and dtype == torch.float16) + allow_float16_runs = ( + (checker == "float16" or checker == "torch.float16") and \ + (dtype == torch.float16) + ) if allow_all_runs or allow_float16_runs: if eval(_dtype) is not None: From d83767f321203359cd31a096b502b6d81181fe77 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:24:26 -0700 Subject: [PATCH 077/272] Update vision.py --- unsloth/models/vision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 12ec00c3bd..705647cb28 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -359,7 +359,6 @@ def from_pretrained( custom_datatype = os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] assert custom_datatype.count(";") >= 4 checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code = custom_datatype.split(";", 4) - print(checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code) # Allow custom dtypes on all runs allow_all_runs = (checker == "all") # Allow only on float16 datatypes @@ -367,6 +366,7 @@ def from_pretrained( (checker == "float16" or checker == "torch.float16") and \ (dtype == torch.float16) ) + print([checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) if allow_all_runs or allow_float16_runs: if eval(_dtype) is not None: @@ -387,7 +387,7 @@ def from_pretrained( if not ("attn_implementation" in kwargs): kwargs["attn_implementation"] = "sdpa" if not supports_sdpa: - print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to eager!") + print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to fast eager.") del kwargs["attn_implementation"] pass From 5b575d87ef24302cb434743868836bcd95acc2f2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:27:58 -0700 Subject: [PATCH 078/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 705647cb28..44f62d850d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -366,7 +366,7 @@ def from_pretrained( (checker == "float16" or checker == "torch.float16") and \ (dtype == torch.float16) ) - print([checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) + print([allow_float16_runs], [checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) if allow_all_runs or allow_float16_runs: if eval(_dtype) is not None: From 66eee4deea47e76281497aeabc0be1a215ab9f39 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:29:05 -0700 Subject: [PATCH 079/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 44f62d850d..3ce03e6da7 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -366,7 +366,7 @@ def from_pretrained( (checker == "float16" or checker == "torch.float16") and \ (dtype == torch.float16) ) - print([allow_float16_runs], [checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) + print([(checker == "float16" or checker == "torch.float16")], [dtype], [allow_float16_runs], [checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) if allow_all_runs or allow_float16_runs: if eval(_dtype) is not None: From 27d044e47840785f40a195aa7ee77dcab1149046 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 20:38:44 -0700 Subject: [PATCH 080/272] Update vision.py --- unsloth/models/vision.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 3ce03e6da7..e125824c63 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -364,10 +364,8 @@ def from_pretrained( # Allow only on float16 datatypes 
allow_float16_runs = ( (checker == "float16" or checker == "torch.float16") and \ - (dtype == torch.float16) + (dtype == torch.float16 or os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "1") ) - print([(checker == "float16" or checker == "torch.float16")], [dtype], [allow_float16_runs], [checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) - if allow_all_runs or allow_float16_runs: if eval(_dtype) is not None: dtype = eval(_dtype) From 34d07d89463c21cbb33275ccffaf044e3d7df243 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 20:42:24 -0700 Subject: [PATCH 081/272] Update vision.py --- unsloth/models/vision.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e125824c63..23e2bb088a 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -464,7 +464,6 @@ def from_pretrained( module._pre_set_compute_dtype = torch.float32 pass # Edit data-types - print("custom_datatype", custom_datatype) if custom_datatype is not None: with torch.no_grad(): for jj, (name, module) in enumerate(model.named_modules()): @@ -524,9 +523,6 @@ def from_pretrained( ) model, tokenizer = patch_tokenizer(model, tokenizer) model = post_patch_loss_function(model) - global partial_model - partial_model = model - raise # Log Unsloth version for future fastpaths for inference if hasattr(model, "config"): From 3ad756145f638cfaa2f15a21f24d4b97d58d4ad1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 21:29:44 -0700 Subject: [PATCH 082/272] Update loader.py --- unsloth/models/loader.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a7d3da17bd..0156e2f059 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,11 +638,9 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and 
hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 0:"\ + "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ - "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ - "module._pre_set_compute_dtype = torch.float32\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From b75729795a21149ff23f513469f603f21ddf7a0b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 21:31:05 -0700 Subject: [PATCH 083/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0156e2f059..14baa60d66 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "torch.amax(dequantize_module_weight(module)) >= 102400:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" From ceeca866ae8cb9774a830d3fba84c9238c281d77 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 21:44:30 -0700 Subject: [PATCH 084/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 14baa60d66..4e0365ce1e 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 102400:"\ + "torch.amax(dequantize_module_weight(module)) >= 512:"\ "module._pre_set_compute_dtype = 
torch.float32\n"\ "\n"\ ";" From 87758b98edf6cc2aa8addbd19cfba4678fa3cc2c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 21:51:36 -0700 Subject: [PATCH 085/272] Update loader.py --- unsloth/models/loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 4e0365ce1e..85696859ae 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,9 +638,11 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 512:"\ + "torch.amax(dequantize_module_weight(module)) >= 256:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ + "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 97d34d48536b35c0d2fd7d60995c099aea8a6d83 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 00:23:27 -0700 Subject: [PATCH 086/272] Update loader.py --- unsloth/models/loader.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 85696859ae..4e0365ce1e 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,11 +638,9 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 256:"\ + "torch.amax(dequantize_module_weight(module)) >= 512:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ - "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ - "module._pre_set_compute_dtype = torch.float32\n"\ ";" # Set norms to float32 since anyways 
they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 43bf41f9df86e3bb2bf40e4db8957e0418fbc5e6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 00:24:39 -0700 Subject: [PATCH 087/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 4e0365ce1e..94a07bf06a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 512:"\ + "torch.amax(dequantize_module_weight(module)) >= 256:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" From 6e7ad5259d13c959cb08ee81a97547425144d639 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 00:26:49 -0700 Subject: [PATCH 088/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 94a07bf06a..c9c1e05553 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 256:"\ + "torch.amax(dequantize_module_weight(module)) >= 128:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" From d605aa7311bffa8e80ae6ec3e6f34716d209e140 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 00:35:38 -0700 Subject: [PATCH 089/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index c9c1e05553..6ec045eb36 
100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 128:"\ + "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" From f417dc882969acfd9e11a4a3d0ed7b548371aa2e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 00:51:06 -0700 Subject: [PATCH 090/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 6ec045eb36..a7d3da17bd 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -641,6 +641,8 @@ def from_pretrained( "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ + "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 05fe3d1fd7d6f202a4f8b50262d5d00127eb72e2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 01:21:10 -0700 Subject: [PATCH 091/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a7d3da17bd..28bb896760 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ - "if ('down_projs' in name) and hasattr(module, 'weight') and "\ + "if ('down_projs' in name or 'gate_up_proj' in name) and hasattr(module, 'weight') and "\ 
"torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ From a79d6f6ac880e17b6079b1ba7981b130615a19dc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 01:54:48 -0700 Subject: [PATCH 092/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 28bb896760..a7d3da17bd 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ - "if ('down_projs' in name or 'gate_up_proj' in name) and hasattr(module, 'weight') and "\ + "if ('down_projs' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ From 59702c494078128468015ccd003761e83ca2451a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 02:09:23 -0700 Subject: [PATCH 093/272] Update loader.py --- unsloth/models/loader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a7d3da17bd..b95678a499 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -643,6 +643,10 @@ def from_pretrained( "\n"\ "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ "module._pre_set_compute_dtype = torch.float32\n"\ + "\n"\ + "if ('self_attn' in name) and hasattr(module, 'sinks'):"\ + "module.sinks._pre_set_compute_dtype = torch.float32\n"\ + "\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 1b66aee7b2f395ba51e1a2e69219f2c08701a95c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 02:32:41 -0700 Subject: [PATCH 094/272] Update loader.py --- 
unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index b95678a499..ef39e636c2 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ - "if ('down_projs' in name) and hasattr(module, 'weight') and "\ + "if ('down_projs' in name or '_proj' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ From a71fa05c7a7a8e72547a7c054e659ce1149e088e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 02:51:45 -0700 Subject: [PATCH 095/272] Update loader.py --- unsloth/models/loader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ef39e636c2..dd0a3961e7 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ - "if ('down_projs' in name or '_proj' in name) and hasattr(module, 'weight') and "\ + "if ('down_projs') and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ @@ -647,6 +647,9 @@ def from_pretrained( "if ('self_attn' in name) and hasattr(module, 'sinks'):"\ "module.sinks._pre_set_compute_dtype = torch.float32\n"\ "\n"\ + "if ('embed_tokens' in name):"\ + "module.sinks._pre_set_compute_dtype = torch.float32\n"\ + "\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 
d3e8625b1de6703165535f985d54ebf621eec1ae Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 02:53:29 -0700 Subject: [PATCH 096/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index dd0a3961e7..1c64ae4cfc 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -647,8 +647,8 @@ def from_pretrained( "if ('self_attn' in name) and hasattr(module, 'sinks'):"\ "module.sinks._pre_set_compute_dtype = torch.float32\n"\ "\n"\ - "if ('embed_tokens' in name):"\ - "module.sinks._pre_set_compute_dtype = torch.float32\n"\ + "if ('embed_tokens' in name) and hasattr(module, 'weight'):"\ + "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 From fb112cf3c6b48df1afcf51827f775ce1fee951eb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 03:09:03 -0700 Subject: [PATCH 097/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 1c64ae4cfc..e8c410ebd1 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -647,7 +647,7 @@ def from_pretrained( "if ('self_attn' in name) and hasattr(module, 'sinks'):"\ "module.sinks._pre_set_compute_dtype = torch.float32\n"\ "\n"\ - "if ('embed_tokens' in name) and hasattr(module, 'weight'):"\ + "if ('embed_tokens' in name or 'lm_head' in name) and hasattr(module, 'weight'):"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" From 5dbdcc565dd6dc8fa5edc2bf4314ad326ffef18c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 03:29:27 -0700 Subject: [PATCH 098/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index e8c410ebd1..c9e0646af7 100644 --- 
a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -648,7 +648,7 @@ def from_pretrained( "module.sinks._pre_set_compute_dtype = torch.float32\n"\ "\n"\ "if ('embed_tokens' in name or 'lm_head' in name) and hasattr(module, 'weight'):"\ - "module._pre_set_compute_dtype = torch.float32\n"\ + "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 From fdaa0074093bfffd626632bf8153d52eb7c30a4e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 04:02:33 -0700 Subject: [PATCH 099/272] Update loader.py --- unsloth/models/loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index c9e0646af7..71459599a5 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -639,13 +639,13 @@ def from_pretrained( "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs') and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ - "module._pre_set_compute_dtype = torch.float32\n"\ + "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ - "module._pre_set_compute_dtype = torch.float32\n"\ + "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ "if ('self_attn' in name) and hasattr(module, 'sinks'):"\ - "module.sinks._pre_set_compute_dtype = torch.float32\n"\ + "module.sinks._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ "if ('embed_tokens' in name or 'lm_head' in name) and hasattr(module, 'weight'):"\ "module._pre_set_compute_dtype = torch.bfloat16\n"\ From ba0eb04d9076811da446e8a7d46717ac91fd2ada Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 04:19:00 -0700 Subject: [PATCH 100/272] Bug fix --- unsloth/models/loader.py | 2 +- unsloth/models/vision.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 
71459599a5..7b8320c65a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ - "if ('down_projs') and hasattr(module, 'weight') and "\ + "if ('_proj' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 23e2bb088a..486a049339 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -213,7 +213,8 @@ def unsloth_base_fast_generate( cache_implementation = None if cache_implementation is not None: swa = getattr(getattr(self.config, "text_config", self.config), "sliding_window", None) - if swa == 0 or type(swa) is not int: + if (swa == 0 or type(swa) is not int) \ + and (getattr(self, "_can_compile_fullgraph", True) is True): cache_implementation = "static" else: cache_implementation = "hybrid" From 3f982620a575c0117aafc572c4767d77ced7304b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 05:47:58 -0700 Subject: [PATCH 101/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 7b8320c65a..f6bb23551d 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -636,7 +636,7 @@ def from_pretrained( # Set down projection compute dtype to be float32 for float16 machines # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "torch.float16;torch.bfloat16;torch.float16;"\ + "torch.float16;torch.bfloat16;torch.bfloat16;"\ "if ('_proj' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = 
torch.bfloat16\n"\ From 3e6511b84f297289bf694893b023db35fd24fc49 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 06:37:40 -0700 Subject: [PATCH 102/272] Update loader.py --- unsloth/models/loader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f6bb23551d..889d170a17 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -785,7 +785,8 @@ def from_pretrained( model_types = ["siglip"] + model_types # Set forced float32 env flag - os.environ["UNSLOTH_FORCE_FLOAT32"] = "0" + if "UNSLOTH_FORCE_FLOAT32" not in os.environ: + os.environ["UNSLOTH_FORCE_FLOAT32"] = "0" do_forced_float32 = False for model_type_arch in model_types: if model_type_arch != "siglip": break From c9e75375b31d14c66e9f8846e2793f96e9bfee71 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 07:00:44 -0700 Subject: [PATCH 103/272] Update loader.py --- unsloth/models/loader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 889d170a17..3112f674fe 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -641,6 +641,9 @@ def from_pretrained( "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ + "if hasattr(module, 'weight'):"\ + "module._pre_set_compute_dtype = torch.bfloat16\n"\ + "\n"\ "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ From 2e38e8a9b9e46b5bb4bf026dfff677728d662297 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 22 Aug 2025 03:42:08 -0700 Subject: [PATCH 104/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3112f674fe..9ae1448762 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -897,6 +897,8 @@ def from_pretrained( if 
load_in_4bit: # Fix up bitsandbytes config + print("torch_dtype", model.config.to_dict().get("torch_dtype")) + print("dtype", model.config.to_dict().get("dtype")) quantization_config = \ { # Sometimes torch_dtype is not a string!! From 8b3a8bacf4a19133d9d4952fad7fd65d437861a8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 22 Aug 2025 03:44:29 -0700 Subject: [PATCH 105/272] Update loader.py --- unsloth/models/loader.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 9ae1448762..1b3b2d6011 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -436,10 +436,12 @@ def from_pretrained( if load_in_4bit: # Fix up bitsandbytes config + config = model.config.to_dict() + torch_dtype = config.get("dtype") or config.get("torch_dtype") quantization_config = \ { # Sometimes torch_dtype is not a string!! - "bnb_4bit_compute_dtype" : model.config.to_dict()["torch_dtype"], + "bnb_4bit_compute_dtype" : torch_dtype, "bnb_4bit_quant_type" : "nf4", "bnb_4bit_use_double_quant" : True, "llm_int8_enable_fp32_cpu_offload" : False, @@ -897,12 +899,12 @@ def from_pretrained( if load_in_4bit: # Fix up bitsandbytes config - print("torch_dtype", model.config.to_dict().get("torch_dtype")) - print("dtype", model.config.to_dict().get("dtype")) + config = model.config.to_dict() + torch_dtype = config.get("dtype") or config.get("torch_dtype") quantization_config = \ { # Sometimes torch_dtype is not a string!! 
- "bnb_4bit_compute_dtype" : model.config.to_dict()["torch_dtype"], + "bnb_4bit_compute_dtype" : torch_dtype, "bnb_4bit_quant_type" : "nf4", "bnb_4bit_use_double_quant" : True, "llm_int8_enable_fp32_cpu_offload" : False, From f706d20e56924bdb26190625ebb66bac4eaa63d6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 22 Aug 2025 03:59:09 -0700 Subject: [PATCH 106/272] torch_dtype --- unsloth/models/vision.py | 19 ++++++++++++++----- unsloth/save.py | 15 +++++++++++---- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 486a049339..fc31032594 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -73,6 +73,9 @@ PROMPT_LOOPKUP = dict() from transformers import GenerationConfig, CompileConfig, HybridCache +from transformers import PretrainedConfig +HAS_TORCH_DTYPE = "torch_dtype" in PretrainedConfig.__doc__ + _compile_config = CompileConfig( fullgraph = False, dynamic = None, @@ -118,7 +121,7 @@ def unsloth_base_fast_generate( bsz = input_ids.shape[0] FastBaseModel.for_inference(self) - dtype = _get_dtype(self.config.torch_dtype) + dtype = _get_dtype(getattr(self.config, "dtype", None) or getattr(self.config, "torch_dtype", None)) # Check if VLM is_vlm = any( @@ -246,8 +249,6 @@ def unsloth_base_fast_generate( return output pass -global partial_model - class FastBaseModel: @staticmethod @@ -443,11 +444,17 @@ def from_pretrained( torch_dtype = dtype if do_forced_float32: torch_dtype = torch.bfloat16 + if HAS_TORCH_DTYPE: + kwargs["torch_dtype"] = torch_dtype + else: + # Transformers removed torch_dtype + kwargs["dtype"] = torch_dtype + raise_handler = RaiseUninitialized() model = auto_model.from_pretrained( model_name, device_map = device_map, - torch_dtype = torch_dtype, + # torch_dtype = torch_dtype, # Transformers removed torch_dtype # quantization_config = bnb_config, token = token, trust_remote_code = trust_remote_code, @@ -698,7 +705,9 @@ def post_patch_model( full_finetuning 
= os.environ.get("UNSLOTH_ENABLE_FULL_FINETUNING", "0") == "1" float32_mixed_precision = True - if _get_dtype(model.config.torch_dtype) == torch.bfloat16 and full_finetuning: + if _get_dtype( + getattr(model.config, "dtype", None) or getattr(model.config, "torch_dtype", None) + ) == torch.bfloat16 and full_finetuning: # Use bfloat16 precision for full finetuning float32_mixed_precision = False diff --git a/unsloth/save.py b/unsloth/save.py index 9539b66701..4535c7dc42 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -549,11 +549,14 @@ def unsloth_save_model( from collections import OrderedDict state_dict = OrderedDict() - torch_dtype = internal_model.config.torch_dtype + torch_dtype = \ + getattr(internal_model.config, "dtype", None) or \ + getattr(internal_model.config, "torch_dtype", None) if type(torch_dtype) is str: if torch_dtype == "float16": torch_dtype = torch.float16 elif torch_dtype == "bfloat16": torch_dtype = torch.bfloat16 - pass + else: + torch_dtype = internal_model.model.embed_tokens.weight.dtype # Check modules to save float32 dtype state_dict["model.embed_tokens.weight"] = internal_model.model.embed_tokens.weight.data.to(torch_dtype) @@ -1880,7 +1883,9 @@ def unsloth_save_pretrained_gguf( for _ in range(3): gc.collect() - model_dtype = self.config.torch_dtype + model_dtype = \ + getattr(self.config, "dtype", None) or \ + getattr(self.config, "torch_dtype", None) model_type = self.config.model_type if type(model_dtype) is str: assert(model_dtype == "float16" or model_dtype == "bfloat16") @@ -2058,7 +2063,9 @@ def unsloth_push_to_hub_gguf( for _ in range(3): gc.collect() - model_dtype = self.config.torch_dtype + model_dtype = \ + getattr(self.config, "dtype", None) or \ + getattr(self.config, "torch_dtype", None) model_type = self.config.model_type if type(model_dtype) is str: assert(model_dtype == "float16" or model_dtype == "bfloat16") From b56cc1b82cfb64a02bbe7a12afd1c05eaa4bf53d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 4 Sep 
2025 03:33:54 -0700 Subject: [PATCH 107/272] Update rl.py --- unsloth/models/rl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 0f1fa2dbf6..b1ab96c840 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -513,7 +513,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): "fp16" : False, "include_tokens_per_second" : False, "include_num_input_tokens_seen" : False, - "auto_find_batch_size" : True, # Auto /2 batch size + "auto_find_batch_size" : False, # Auto /2 batch size - too many people complained so removing "dataloader_pin_memory" : True, # Might fail so disable for now # "dataloader_persistent_workers" : True, # Keeps dataloader in RAM From c47f9367f53c0495bace2aa145252955d620aa78 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 4 Sep 2025 03:55:38 -0700 Subject: [PATCH 108/272] Fix CE Loss --- unsloth/models/llama.py | 4 ++-- unsloth/models/mistral.py | 29 +++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index cf2ca75f75..f978060c9c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1236,7 +1236,7 @@ def _CausalLM_fast_forward( # < 1024 Normal Unsloth uses less VRAM! 
if bsz*q_len <= 1024: RETURN_LOGITS = True - if not RETURN_LOGITS and HAS_CUT_CROSS_ENTROPY and labels is not None: + if not RETURN_LOGITS and labels is not None: n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None) @@ -1259,7 +1259,7 @@ def _CausalLM_fast_forward( mask = None, n_items = n_items, scaling = getattr(self, "accelerator_scaler", None), - target_gb = 1, + target_gb = None, torch_compile = True, logit_softcapping = logit_softcapping, ) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 6274f2e5df..faab2d30b1 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -300,17 +300,30 @@ def MistralForCausalLM_fast_forward( # < 1024 Normal Unsloth uses less VRAM! if bsz * q_len <= 1024: RETURN_LOGITS = True - if not RETURN_LOGITS and HAS_CUT_CROSS_ENTROPY and os.environ.get("UNSLOTH_ENABLE_CCE", "1") != "0" and labels is not None: + if not RETURN_LOGITS and labels is not None: n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None) logit_softcapping = getattr(self.config, "final_logit_softcapping", 0) - loss = fused_linear_cross_entropy( - hidden_states = hidden_states, - lm_weight = lm_head, - labels = labels, - num_items_in_batch = n_items, - logit_softcapping = logit_softcapping, - ) + # loss = fused_linear_cross_entropy( + # hidden_states = hidden_states, + # lm_weight = lm_head, + # labels = labels, + # num_items_in_batch = n_items, + # logit_softcapping = logit_softcapping, + # ) + loss = unsloth_fused_ce_loss( + trainer = None, + hidden_states = hidden_states, + lm_head_weight = lm_head, + lm_head_bias = None, + labels = labels, + mask = None, + n_items = n_items, + scaling = getattr(self, "accelerator_scaler", None), + target_gb = None, + torch_compile = True, + logit_softcapping = logit_softcapping, + ) if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output From 0b896c5f93e10a24b6db32d96627bb4482ff7558 Mon 
Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 4 Sep 2025 05:11:33 -0700 Subject: [PATCH 109/272] Versioning --- pyproject.toml | 4 ++-- unsloth/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8c60cb5866..160182c2a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.8.9", + "unsloth_zoo>=2025.9.1", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.8.9", + "unsloth_zoo>=2025.9.1", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 1b2a9310ff..25a54165b7 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -214,7 +214,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # Check for unsloth_zoo try: unsloth_zoo_version = importlib_version("unsloth_zoo") - if Version(unsloth_zoo_version) < Version("2025.8.8"): + if Version(unsloth_zoo_version) < Version("2025.9.1"): print( "Unsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\n"\ "Do this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`" From 7234a62f5b40d2ee96e65570a8e7a769e5449271 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 01:59:13 -0700 Subject: [PATCH 110/272] Update loader.py --- unsloth/models/loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index b1844a1472..952f900ff4 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -527,6 +527,7 @@ def from_pretrained( qat_scheme = None, *args, **kwargs, ): + print("model_name", model_name) if token is None: token = get_token() # Login to allow private models if 
token is not None: From 68c1aba08999d4f8801cda2194bcab5234109f31 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 02:01:49 -0700 Subject: [PATCH 111/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 952f900ff4..b689b1f3c2 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -254,7 +254,9 @@ def from_pretrained( # Get base model for PEFT: if is_peft: # Check base model again for PEFT + print("is_peft", model_name) model_name = peft_config.base_model_name_or_path + print("is_peft", model_name) if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) model_config = AutoConfig.from_pretrained( From 05fc2f2628b54ee2e867ff5c307abcfda7310cce Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 04:31:12 -0700 Subject: [PATCH 112/272] extract_model_type_from_config --- unsloth/models/_utils.py | 33 ++++++++++++++++++++++++++++++++- unsloth/models/loader.py | 4 +--- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 597ed0244b..0346ba13c1 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.2" +__version__ = "2025.9.3" __all__ = [ "SUPPORTS_BFLOAT16", "is_bfloat16_supported", "is_vLLM_available", + "extract_model_type_from_config", "prepare_model_for_kbit_training", "xformers", @@ -1561,3 +1562,33 @@ def _prepare_model_for_qat(model: torch.nn.Module, qat_scheme: str) -> torch.nn. 
quantize_(model, QATConfig(base_config, step="prepare"), filter_fn=filter_fn) return model pass + + +def extract_model_type_from_config(config): + """ Gets model_type from config file - can be PEFT or normal HF """ + model_type = None + from peft import PeftConfig + if issubclass(type(config), PeftConfig): + model_type_list = re.finditer(r"transformers\.models\.([^\.]{2,})\.modeling_\1", str(config)) + model_type_list = list(model_type_list) + # Use transformers.models.gpt_oss.modeling_gpt_oss + if len(model_type_list) != 0: + model_type = model_type_list[0].group(1) + elif hasattr(config, "auto_mapping"): + # Use GptOssForCausalLM + model_type = config.auto_mapping.get("base_model_class", None) + if model_type is None: + # Last resort use model name unsloth/gpt-oss-20b-unsloth-bnb-4bit + model_type = config.base_model_name_or_path + model_type = os.path.split(model_type)[-1] + else: + + if model_type is None: + raise TypeError(f"Unsloth: Cannot determine model type for config file: {str(config)}") + + # Standardize model_type + model_type = model_type.lower() + model_type = model_type.replace("_", "-") + model_type = model_type.replace("/", "-") + return model_type +pass diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index c0b996ae02..9c26c8834e 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -20,6 +20,7 @@ HAS_FLASH_ATTENTION_SOFTCAPPING, USE_MODELSCOPE, get_transformers_model_type, + extract_model_type_from_config, ) from .granite import FastGraniteModel from .llama import FastLlamaModel, logger @@ -254,9 +255,7 @@ def from_pretrained( # Get base model for PEFT: if is_peft: # Check base model again for PEFT - print("is_peft", model_name) model_name = peft_config.base_model_name_or_path - print("is_peft", model_name) if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) model_config = AutoConfig.from_pretrained( @@ -529,7 +528,6 @@ def from_pretrained( qat_scheme = None, *args, **kwargs, ): - 
print("model_name", model_name) if token is None: token = get_token() # Login to allow private models if token is not None: From 99c7afb3fcc8aaa755dba2ad9f74140ff978028c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 21:51:46 -0700 Subject: [PATCH 113/272] Model types --- unsloth/models/_utils.py | 39 +++++++++++--- unsloth/models/loader.py | 114 +++++++++++++++++++++------------------ 2 files changed, 92 insertions(+), 61 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 0346ba13c1..f961a49de5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1566,7 +1566,9 @@ def _prepare_model_for_qat(model: torch.nn.Module, qat_scheme: str) -> torch.nn. def extract_model_type_from_config(config): """ Gets model_type from config file - can be PEFT or normal HF """ - model_type = None + if config is None: + raise TypeError(f"Unsloth: Cannot determine model type for config file: {str(config)}") + model_types = None from peft import PeftConfig if issubclass(type(config), PeftConfig): model_type_list = re.finditer(r"transformers\.models\.([^\.]{2,})\.modeling_\1", str(config)) @@ -1574,6 +1576,7 @@ def extract_model_type_from_config(config): # Use transformers.models.gpt_oss.modeling_gpt_oss if len(model_type_list) != 0: model_type = model_type_list[0].group(1) + model_types = [model_type] elif hasattr(config, "auto_mapping"): # Use GptOssForCausalLM model_type = config.auto_mapping.get("base_model_class", None) @@ -1581,14 +1584,34 @@ def extract_model_type_from_config(config): # Last resort use model name unsloth/gpt-oss-20b-unsloth-bnb-4bit model_type = config.base_model_name_or_path model_type = os.path.split(model_type)[-1] + model_types = [model_type] else: - - if model_type is None: + from collections.abc import Mapping, Sequence + def find_values(data, target_key): + stack = [data] + while stack: + obj = stack.pop() + if isinstance(obj, Mapping): + # Emit values for matches + if target_key in obj: + 
yield obj[target_key] + # Keep walking into nested values + stack.extend(obj.values()) + elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes, bytearray)): + # Walk sequences (lists/tuples/sets), but not strings/bytes + stack.extend(obj) + model_types = list(find_values(getattr(config, "to_dict", lambda *args, **kwargs: {})(), "model_type")) + pass + if model_types is None: raise TypeError(f"Unsloth: Cannot determine model type for config file: {str(config)}") - # Standardize model_type - model_type = model_type.lower() - model_type = model_type.replace("_", "-") - model_type = model_type.replace("/", "-") - return model_type + final_model_types = [] + for model_type in model_types: + model_type = model_type.lower() + model_type = model_type.replace("_", "") + model_type = model_type.replace("-", "") + model_type = model_type.replace("/", "") + model_type = model_type.replace(".", "") + final_model_types.append(model_type) + return tuple(sorted(final_model_types)) pass diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 9c26c8834e..6cefe33aaf 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -84,7 +84,8 @@ global FORCE_FLOAT32 FORCE_FLOAT32 = [ "gemma3", - "gpt_oss", + "gemma3n", + "gptoss", ] class FastLanguageModel(FastLlamaModel): @@ -178,6 +179,8 @@ def from_pretrained( autoconfig_error = None peft_error = None + model_config = None + peft_config = None try: model_config = AutoConfig.from_pretrained( model_name, @@ -201,8 +204,12 @@ def from_pretrained( peft_error = str(error) is_peft = False pass - - # Both config.json and adapter_config.json should not exist! 
+ model_types = extract_model_type_from_config(model_config or peft_config) + if len(model_types) == 1: + model_type = model_types[0] + else: + # Leave as tuple if more than one arch + model_type = model_types # Old transformers versions check both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 @@ -267,8 +274,6 @@ def from_pretrained( if not was_disabled: enable_progress_bars() - model_type = model_config.model_type - if model_type == "llama": scaling_type = None if getattr(model_config, "rope_scaling", None) is not None: @@ -494,10 +499,11 @@ def from_pretrained( from transformers import AutoModelForVision2Seq pass +# Must be alphabetically sorted for each entry DISABLE_COMPILE_MODEL_NAMES = [ - "aya-vision", + "ayavision", "modernbert", - "granite-vision", + "granite,llavanext,siglipvisionmodel", # Granite-vision 3 ] @@ -574,20 +580,55 @@ def from_pretrained( if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) + # First check if it's a normal model via AutoConfig + from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled + was_disabled = are_progress_bars_disabled() + disable_progress_bars() + + autoconfig_error = None + peft_error = None + model_config = None + peft_config = None + try: + model_config = AutoConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + is_model = True + except Exception as error: + autoconfig_error = str(error) + is_model = False + try: + peft_config = PeftConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + is_peft = True + except Exception as error: + peft_error = str(error) + is_peft = False + pass + model_types = extract_model_type_from_config(model_config or peft_config) + model_types_all = ",".join(model_types) + # Check versions lowered_model_name = model_name.lower() os.environ["UNSLOTH_MODEL_NAME"] = 
lowered_model_name LATEST = '\nPlease use transformers via `pip install --no-deps git+https://github.com/huggingface/transformers.git`' NIGHTLY = '\nPlease use nightly transformers via pip install --upgrade "transformers>=4.49.0"`' # Pixtral - if "pixtral" in lowered_model_name and transformers_version < Version("4.49.0"): + if "pixtral" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Pixtral only works on transformers >= 4.49.0." + LATEST) # Qwen 2.5 - elif "qwen2.5" in lowered_model_name and transformers_version < Version("4.49.0"): + elif "qwen25" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST) # Gemma 3 - elif "gemma-3" in lowered_model_name: - if "gemma-3n" in lowered_model_name: + elif "gemma3" in model_types_all: + if "gemma3n" in model_types_all: if transformers_version < Version("4.53.0"): raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" @@ -605,10 +646,10 @@ def from_pretrained( # common in both gemma-3 and gemma-3n os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" # Cohere - elif "c4ai-command-a-03-2025" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): + elif "cohere2" in model_types_all and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0." 
+ NIGHTLY) # Sesame - elif "csm-1b" in lowered_model_name: + elif "csm" in model_types_all: os.environ["UNSLOTH_COMPILE_DISABLE"] = "1" # Inference is too slow os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # Sesame fails os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ @@ -616,14 +657,14 @@ def from_pretrained( "if name.endswith(('_proj', 'fc1', 'fc2', 'codebook', 'head')): module.to(torch.float16)"\ ";" # Granite 4 - elif 'granite-4' in lowered_model_name: + elif 'granitemoehybrid' in model_types_all: # Granite-4 rms norms are stored as 16 bit, but we upcast os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # Olmo 2 - elif "olmo-2" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): + elif "olmo2" in model_types_all and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: OLMo-2 only works on transformers >= 4.50.0." + NIGHTLY) - elif "falcon-h1" in lowered_model_name: + elif "falconh1" in model_types_all: # Falcon must use float32 Triton ie TRITON_F32_DEFAULT = 'ieee' # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ @@ -631,7 +672,7 @@ def from_pretrained( "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16)"\ ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" - elif "gpt-oss" in lowered_model_name: + elif "gptoss" in model_types_all: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB @@ -681,39 +722,6 @@ def from_pretrained( model_name = snapshot_download(model_name) pass - # First check if it's a normal model via AutoConfig - from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled - was_disabled = are_progress_bars_disabled() - disable_progress_bars() - - autoconfig_error = None - peft_error = None - try: - 
model_config = AutoConfig.from_pretrained( - model_name, - token = token, - revision = revision, - trust_remote_code = trust_remote_code, - ) - is_model = True - except Exception as error: - autoconfig_error = str(error) - is_model = False - try: - peft_config = PeftConfig.from_pretrained( - model_name, - token = token, - revision = revision, - trust_remote_code = trust_remote_code, - ) - is_peft = True - except Exception as error: - peft_error = str(error) - is_peft = False - pass - - # Both config.json and adapter_config.json should not exist! - # Old transformers versions check both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 @@ -799,8 +807,8 @@ def from_pretrained( if model_type_arch != "siglip": break global FORCE_FLOAT32 for disable_name in FORCE_FLOAT32: - if (disable_name.lower() == model_type_arch.lower().replace("-", "_") or \ - disable_name.lower() in model_name.lower()) and \ + if (disable_name.lower() == model_type_arch.lower().replace("-", "").replace("_", "") or \ + disable_name.lower() in model_types_all) and \ ((dtype == torch.float16) or not SUPPORTS_BFLOAT16): os.environ["UNSLOTH_FORCE_FLOAT32"] = "1" dtype = torch.bfloat16 # Change to bfloat16 loading @@ -846,7 +854,7 @@ def from_pretrained( ) pass # Fix SDPA - if "gemma-3n" in lowered_model_name: + if "gemma3n" in model_types_all: supports_sdpa = False pass From fc5d91de3b2200e6a4a32e865c5f18272271de5a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 22:02:56 -0700 Subject: [PATCH 114/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 6cefe33aaf..44a74601d9 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -205,6 +205,7 @@ def from_pretrained( is_peft = False pass model_types = extract_model_type_from_config(model_config or peft_config) + print("model_types", model_types) if len(model_types) == 1: model_type = model_types[0] else: @@ 
-614,6 +615,7 @@ def from_pretrained( pass model_types = extract_model_type_from_config(model_config or peft_config) model_types_all = ",".join(model_types) + print("model_types", model_types) # Check versions lowered_model_name = model_name.lower() From 702a9ead13538d5a930c9a2f644fb92671dd35f2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 22:11:08 -0700 Subject: [PATCH 115/272] get_transformers_model_type --- unsloth/models/_utils.py | 54 ---------------------------------------- unsloth/models/loader.py | 26 ++++++------------- 2 files changed, 8 insertions(+), 72 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index f961a49de5..56b98489f6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -18,7 +18,6 @@ "SUPPORTS_BFLOAT16", "is_bfloat16_supported", "is_vLLM_available", - "extract_model_type_from_config", "prepare_model_for_kbit_training", "xformers", @@ -1562,56 +1561,3 @@ def _prepare_model_for_qat(model: torch.nn.Module, qat_scheme: str) -> torch.nn. 
quantize_(model, QATConfig(base_config, step="prepare"), filter_fn=filter_fn) return model pass - - -def extract_model_type_from_config(config): - """ Gets model_type from config file - can be PEFT or normal HF """ - if config is None: - raise TypeError(f"Unsloth: Cannot determine model type for config file: {str(config)}") - model_types = None - from peft import PeftConfig - if issubclass(type(config), PeftConfig): - model_type_list = re.finditer(r"transformers\.models\.([^\.]{2,})\.modeling_\1", str(config)) - model_type_list = list(model_type_list) - # Use transformers.models.gpt_oss.modeling_gpt_oss - if len(model_type_list) != 0: - model_type = model_type_list[0].group(1) - model_types = [model_type] - elif hasattr(config, "auto_mapping"): - # Use GptOssForCausalLM - model_type = config.auto_mapping.get("base_model_class", None) - if model_type is None: - # Last resort use model name unsloth/gpt-oss-20b-unsloth-bnb-4bit - model_type = config.base_model_name_or_path - model_type = os.path.split(model_type)[-1] - model_types = [model_type] - else: - from collections.abc import Mapping, Sequence - def find_values(data, target_key): - stack = [data] - while stack: - obj = stack.pop() - if isinstance(obj, Mapping): - # Emit values for matches - if target_key in obj: - yield obj[target_key] - # Keep walking into nested values - stack.extend(obj.values()) - elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes, bytearray)): - # Walk sequences (lists/tuples/sets), but not strings/bytes - stack.extend(obj) - model_types = list(find_values(getattr(config, "to_dict", lambda *args, **kwargs: {})(), "model_type")) - pass - if model_types is None: - raise TypeError(f"Unsloth: Cannot determine model type for config file: {str(config)}") - # Standardize model_type - final_model_types = [] - for model_type in model_types: - model_type = model_type.lower() - model_type = model_type.replace("_", "") - model_type = model_type.replace("-", "") - model_type = 
model_type.replace("/", "") - model_type = model_type.replace(".", "") - final_model_types.append(model_type) - return tuple(sorted(final_model_types)) -pass diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 44a74601d9..7e8a32caa7 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -20,7 +20,6 @@ HAS_FLASH_ATTENTION_SOFTCAPPING, USE_MODELSCOPE, get_transformers_model_type, - extract_model_type_from_config, ) from .granite import FastGraniteModel from .llama import FastLlamaModel, logger @@ -204,8 +203,7 @@ def from_pretrained( peft_error = str(error) is_peft = False pass - model_types = extract_model_type_from_config(model_config or peft_config) - print("model_types", model_types) + model_types = get_transformers_model_type(model_config or peft_config) if len(model_types) == 1: model_type = model_types[0] else: @@ -581,6 +579,12 @@ def from_pretrained( if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) + # Check modelscope + if USE_MODELSCOPE and not os.path.exists(model_name): + from modelscope import snapshot_download + model_name = snapshot_download(model_name) + pass + # First check if it's a normal model via AutoConfig from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled was_disabled = are_progress_bars_disabled() @@ -613,9 +617,8 @@ def from_pretrained( peft_error = str(error) is_peft = False pass - model_types = extract_model_type_from_config(model_config or peft_config) + model_types = get_transformers_model_type(model_config or peft_config) model_types_all = ",".join(model_types) - print("model_types", model_types) # Check versions lowered_model_name = model_name.lower() @@ -719,11 +722,6 @@ def from_pretrained( os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" pass - if USE_MODELSCOPE and not os.path.exists(model_name): - from modelscope import snapshot_download - model_name = snapshot_download(model_name) - pass - # Old 
transformers versions check both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 @@ -793,15 +791,7 @@ def from_pretrained( else: redirector = contextlib.redirect_stdout(open(os.devnull, "w")) - # Get model types like Gemma3 etc - model_types = get_transformers_model_type( - model_name = model_name, - token = token, - revision = revision, - trust_remote_code = trust_remote_code, - ) model_types = ["siglip"] + model_types - # Set forced float32 env flag os.environ["UNSLOTH_FORCE_FLOAT32"] = "0" do_forced_float32 = False From 8ece4a6f915e27f536202017132d031094a518ac Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 22:14:10 -0700 Subject: [PATCH 116/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 7e8a32caa7..43c14050c2 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -204,6 +204,7 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) + print("model_types", model_types) if len(model_types) == 1: model_type = model_types[0] else: @@ -619,6 +620,7 @@ def from_pretrained( pass model_types = get_transformers_model_type(model_config or peft_config) model_types_all = ",".join(model_types) + print("model_types", model_types) # Check versions lowered_model_name = model_name.lower() From f3ac0e3b6d382dd432af4a49c919e4d8a2700480 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 22:18:59 -0700 Subject: [PATCH 117/272] Update loader.py --- unsloth/models/loader.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 43c14050c2..27fb3afe41 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -84,7 +84,7 @@ FORCE_FLOAT32 = [ "gemma3", "gemma3n", - "gptoss", + "gpt_oss", ] class FastLanguageModel(FastLlamaModel): @@ -204,7 +204,6 @@ def 
from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) - print("model_types", model_types) if len(model_types) == 1: model_type = model_types[0] else: @@ -501,9 +500,9 @@ def from_pretrained( # Must be alphabetically sorted for each entry DISABLE_COMPILE_MODEL_NAMES = [ - "ayavision", + "aya_vision", "modernbert", - "granite,llavanext,siglipvisionmodel", # Granite-vision 3 + "granite,llava_next", # Granite-vision 3 ] @@ -620,7 +619,6 @@ def from_pretrained( pass model_types = get_transformers_model_type(model_config or peft_config) model_types_all = ",".join(model_types) - print("model_types", model_types) # Check versions lowered_model_name = model_name.lower() @@ -631,7 +629,7 @@ def from_pretrained( if "pixtral" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Pixtral only works on transformers >= 4.49.0." + LATEST) # Qwen 2.5 - elif "qwen25" in model_types_all and transformers_version < Version("4.49.0"): + elif "qwen2_5" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST) # Gemma 3 elif "gemma3" in model_types_all: @@ -671,7 +669,7 @@ def from_pretrained( # Olmo 2 elif "olmo2" in model_types_all and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: OLMo-2 only works on transformers >= 4.50.0." 
+ NIGHTLY) - elif "falconh1" in model_types_all: + elif "falcon_h1" in model_types_all: # Falcon must use float32 Triton ie TRITON_F32_DEFAULT = 'ieee' # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ @@ -679,7 +677,7 @@ def from_pretrained( "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16)"\ ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" - elif "gptoss" in model_types_all: + elif "gpt_oss" in model_types_all: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB From d2b0d4193a6e32cf370f2008d8ad05011a6ad0a6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 22:22:15 -0700 Subject: [PATCH 118/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 27fb3afe41..de2f32f9af 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -809,7 +809,7 @@ def from_pretrained( # Patch gradient checkpointing if use_gradient_checkpointing == "unsloth": patch_unsloth_smart_gradient_checkpointing(dtype = dtype) - + print(model_types) with redirector: patch_loss_functions(torch_compile = False) model_types, supports_sdpa = unsloth_compile_transformers( From e5920fe7027e7caf8602fc9a7d602a84ef197bed Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Sep 2025 01:21:49 -0700 Subject: [PATCH 119/272] Update rl.py --- unsloth/models/rl.py | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index f342a4d86b..14b75f6746 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -44,6 +44,8 @@ } from trl import __version__ as trl_version +from unsloth_zoo.utils import Version +trl_version = Version(trl_version) def 
vLLMSamplingParams(**kwargs): from vllm import SamplingParams @@ -804,7 +806,7 @@ def patch_functions(RLTrainer, trainer_file, RLTrainer_name, all_imports, import " " * 12 + "if (getattr(args, 'use_vllm', False) == False):\n" + \ " " * 16 + "args.use_vllm = True\n" - if "grpo" in trainer_file and trl_version >= "0.18": + if "grpo" in trainer_file and trl_version >= Version("0.18.0"): # If model has vllm_engine, then use vllm in colocate mode. Donot wait for server vllm_setter += \ " " * 12 + "args.vllm_mode='colocate'\n" @@ -850,26 +852,27 @@ def patch_functions(RLTrainer, trainer_file, RLTrainer_name, all_imports, import sampling_params # Add spaces # count the indentation of last line of sampling_params. - last_line = sampling_params.split("\n")[-1] - last_prev_line = sampling_params.split("\n")[-2] - last_prev_indentation = len(last_prev_line) - len(last_prev_line.lstrip()) - last_indentation = len(last_line) - len(last_line.lstrip()) - - - # Add extra arguments to SamplingParams - extra = "**getattr(getattr(args, 'vllm_sampling_params', vLLMSamplingParams()), '_set_kwargs', {})" - # Backwards replace - to_replace = ",\n" + " "*last_prev_indentation + extra + ",\n" + " "*last_indentation + ")" - sampling_params = to_replace.join(sampling_params.rsplit(")", 1)) - # Strip multiple commas - sampling_params = re.sub(r"[\,][\s]{0,}\,", ",", sampling_params) - - new_vllm_part = \ - f"\n{' '*8}if {args}.use_vllm:\n{sampling_params}"\ - f"\n{' '*8}else:\n" + splitted_sampling_params = sampling_params.split("\n") + if len(splitted_sampling_params) >= 2: + last_line = splitted_sampling_params[-1] + last_prev_line = splitted_sampling_params[-2] + last_prev_indentation = len(last_prev_line) - len(last_prev_line.lstrip()) + last_indentation = len(last_line) - len(last_line.lstrip()) + + # Add extra arguments to SamplingParams + extra = "**getattr(getattr(args, 'vllm_sampling_params', vLLMSamplingParams()), '_set_kwargs', {})" + # Backwards replace + to_replace = ",\n" + " 
"*last_prev_indentation + extra + ",\n" + " "*last_indentation + ")" + sampling_params = to_replace.join(sampling_params.rsplit(")", 1)) + # Strip multiple commas + sampling_params = re.sub(r"[\,][\s]{0,}\,", ",", sampling_params) + + new_vllm_part = \ + f"\n{' '*8}if {args}.use_vllm:\n{sampling_params}"\ + f"\n{' '*8}else:\n" pass - if trl_version >= "0.18": + if trl_version >= Version("0.18.0"): # Replace LLM init with already existing vLLM engine for colocate mode vllm_llm_init_pattern = r"self\.llm\s*=\s*LLM\(.*?\)*\)\s*?\n(?!,)" vllm_llm_replacement = "self.llm = model.vllm_engine\n" @@ -881,7 +884,6 @@ def patch_functions(RLTrainer, trainer_file, RLTrainer_name, all_imports, import ) init = init.replace(vllm_part, new_vllm_part) - pass # Search for vLLM calling in all child functions From bf0367eb45dc731104968052415184b8e2d080dc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Sep 2025 01:24:02 -0700 Subject: [PATCH 120/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c2cb87ce3b..c860a92db6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.3", + "unsloth_zoo>=2025.9.4", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.3", + "unsloth_zoo>=2025.9.4", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", From d2c2cc195a99b6b4dbeab7b6f65d1b302b7a9591 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Sep 2025 01:26:58 -0700 Subject: [PATCH 121/272] Update loader.py --- unsloth/models/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index de2f32f9af..a57deef000 100644 --- 
a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -809,7 +809,6 @@ def from_pretrained( # Patch gradient checkpointing if use_gradient_checkpointing == "unsloth": patch_unsloth_smart_gradient_checkpointing(dtype = dtype) - print(model_types) with redirector: patch_loss_functions(torch_compile = False) model_types, supports_sdpa = unsloth_compile_transformers( From 35ca1776b08f81f05e16e268f09cb444f1af1e1b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 12 Sep 2025 18:53:46 -0700 Subject: [PATCH 122/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a57deef000..5ad283d39a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -204,6 +204,7 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) + print("207", model_types_all) if len(model_types) == 1: model_type = model_types[0] else: @@ -619,6 +620,7 @@ def from_pretrained( pass model_types = get_transformers_model_type(model_config or peft_config) model_types_all = ",".join(model_types) + print("623", model_types_all) # Check versions lowered_model_name = model_name.lower() From 2eaf868efa817657405b4b67416b91be171b6285 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 12 Sep 2025 18:55:47 -0700 Subject: [PATCH 123/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 5ad283d39a..fd41390889 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -204,7 +204,7 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) - print("207", model_types_all) + print("207", model_types) if len(model_types) == 1: model_type = model_types[0] else: From 7c892e798fa9ff71f25185ad5e4fb353f3b1a7e6 Mon Sep 17 00:00:00 2001 From: Daniel Han 
Date: Sat, 13 Sep 2025 02:21:02 -0700 Subject: [PATCH 124/272] Update loader.py --- unsloth/models/loader.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index fd41390889..ab258f3ed9 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -204,7 +204,6 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) - print("207", model_types) if len(model_types) == 1: model_type = model_types[0] else: @@ -620,11 +619,11 @@ def from_pretrained( pass model_types = get_transformers_model_type(model_config or peft_config) model_types_all = ",".join(model_types) - print("623", model_types_all) # Check versions lowered_model_name = model_name.lower() - os.environ["UNSLOTH_MODEL_NAME"] = lowered_model_name + if os.environ.get("UNSLOTH_MODEL_NAME", "") == "": + os.environ["UNSLOTH_MODEL_NAME"] = lowered_model_name LATEST = '\nPlease use transformers via `pip install --no-deps git+https://github.com/huggingface/transformers.git`' NIGHTLY = '\nPlease use nightly transformers via pip install --upgrade "transformers>=4.49.0"`' # Pixtral From 72ff24c5ebff286427f46d47a46b82627533ed7f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 13 Sep 2025 21:15:04 -0700 Subject: [PATCH 125/272] Versioning --- pyproject.toml | 4 ++-- unsloth/__init__.py | 2 +- unsloth/models/_utils.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d77683c00a..8df936f807 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.5", + "unsloth_zoo>=2025.9.6", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.5", + "unsloth_zoo>=2025.9.6", "packaging", "tyro", 
"transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 8255e505a8..1be571b69b 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -240,7 +240,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # Check for unsloth_zoo try: unsloth_zoo_version = importlib_version("unsloth_zoo") - if Version(unsloth_zoo_version) < Version("2025.9.5"): + if Version(unsloth_zoo_version) < Version("2025.9.6"): print( "Unsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\n"\ "Do this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`" diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index e3ac56ac83..4cf34aa007 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.4" +__version__ = "2025.9.5" __all__ = [ "SUPPORTS_BFLOAT16", From 227842c5b87203c7c4ff1c2fc76763c79f33493c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 00:00:15 -0700 Subject: [PATCH 126/272] Update _utils.py --- unsloth/models/_utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 4cf34aa007..707d7220b2 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -614,6 +614,18 @@ def _is_openai_available(): return False # Get Xformers try: from xformers import __version__ as xformers_version + # [TODO] Xformers does NOT work on RTX 50x (12), B200 (10), Jetson (11) + # See https://github.com/facebookresearch/xformers/issues/1329 + # CUDA error (/workspace/xfrm2/third_party/flash-attention/hopper/flash_fwd_launch_template.h:188) + major_version, minor_version = torch.cuda.get_device_capability() + if ( + f"{major_version}.{minor_version}" in ("10.0", 
"11.0", "12.0")) and \ + (xformers_version in (Version("0.0.32.post2"),) + ): + raise NotImplementedError( + "Unsloth: Xformers does not work in RTX 50X, Blackwell GPUs as of yet." + ) + pass # Temporarily disable 0.0.27 and higher - inference issues if False: #Version(xformers_version) >= Version("0.0.27"): raise ImportError( @@ -661,7 +673,9 @@ def _is_openai_available(): return False pass import xformers.ops.fmha as xformers xformers_attention = xformers.memory_efficient_attention -except: +except Exception as e: + print("========\nSwitching to SDPA PyTorch native attention which is slightly slower.\n========\n") + print(str(e)) xformers = None xformers_attention = None xformers_version = None From 505ae67fe77b77c04faa7cfb3284fd25441b5ade Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 00:03:50 -0700 Subject: [PATCH 127/272] Update _utils.py --- unsloth/models/_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 707d7220b2..3878367650 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -84,7 +84,7 @@ from unsloth_zoo.utils import Version from importlib.metadata import version as importlib_version from unsloth import DEVICE_TYPE, DEVICE_COUNT - +from unsloth_zoo.log import logger from unsloth_zoo.tokenizer_utils import ( patch_tokenizer as _patch_tokenizer, ) @@ -608,8 +608,6 @@ def _is_openai_available(): return False elif DEVICE_TYPE == "xpu": SUPPORTS_BFLOAT16 = True -from transformers.models.llama.modeling_llama import logger - # ============================================= # Get Xformers try: From 80465dcabe0bd75dc8b43fddf3d8d672608fd087 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 00:06:36 -0700 Subject: [PATCH 128/272] Update _utils.py --- unsloth/models/_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3878367650..2abc6b269b 
100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -617,13 +617,12 @@ def _is_openai_available(): return False # CUDA error (/workspace/xfrm2/third_party/flash-attention/hopper/flash_fwd_launch_template.h:188) major_version, minor_version = torch.cuda.get_device_capability() if ( - f"{major_version}.{minor_version}" in ("10.0", "11.0", "12.0")) and \ - (xformers_version in (Version("0.0.32.post2"),) + (f"{major_version}.{minor_version}" in ("10.0", "11.0", "12.0")) and \ + (Version(xformers_version) in (Version("0.0.32.post2"),)) ): raise NotImplementedError( "Unsloth: Xformers does not work in RTX 50X, Blackwell GPUs as of yet." ) - pass # Temporarily disable 0.0.27 and higher - inference issues if False: #Version(xformers_version) >= Version("0.0.27"): raise ImportError( From 4150e081ada733352975234f5a42f97a696a53c3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 01:21:43 -0700 Subject: [PATCH 129/272] Update _utils.py --- unsloth/models/_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 2abc6b269b..a559d34ca4 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -621,7 +621,11 @@ def _is_openai_available(): return False (Version(xformers_version) in (Version("0.0.32.post2"),)) ): raise NotImplementedError( - "Unsloth: Xformers does not work in RTX 50X, Blackwell GPUs as of yet." + "Unsloth: Xformers does not work in RTX 50X, Blackwell GPUs as of yet. 
Please build from source via\n"\ + "```\n"\ + "pip install ninja\n"\ + "pip install -v --no-build-isolation -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers\n"\ + "```\n" ) # Temporarily disable 0.0.27 and higher - inference issues if False: #Version(xformers_version) >= Version("0.0.27"): From 032c2c840067870adbbba78ad3088ccd5e2ff849 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 22:52:32 -0700 Subject: [PATCH 130/272] Update vision.py --- unsloth/models/vision.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 1451ed92cd..2c77169cb9 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -636,24 +636,17 @@ def get_peft_model( torch.xpu.empty_cache() pass max_seq_length = model.max_seq_length - # if we pass loftq_config = None we will get an error + # If we pass loftq_config = None we will get an error loftq_config = validate_loftq_config(loftq_config, lora_dropout, bias, init_lora_weights, model) - lora_config_dict = { - "r" : r, - "lora_alpha" : lora_alpha, - "target_modules" : target_modules, - "target_parameters" : kwargs.get("target_parameters", None), - "lora_dropout" : lora_dropout, - "bias" : bias, - "task_type" : task_type, - "modules_to_save" : modules_to_save, - "use_rslora" : use_rslora, - "init_lora_weights" : init_lora_weights, - "loftq_config" : loftq_config, - } + + # Get only allowed parameters for LoraConfig + local_variables = { **locals(), **kwargs, } + del local_variables["kwargs"] + allowed_parameters = inspect.signature(LoraConfig).parameters.keys() lora_config = LoraConfig( - **{k:v for k,v in lora_config_dict.items() if k in LoraConfig.__doc__}, + **{ k : v for k, v in local_variables.items() if k in allowed_parameters }, ) + print(lora_config) model = prepare_model_for_kbit_training( model, use_gradient_checkpointing = use_gradient_checkpointing, From 
b105aae096e46646bf9ea5b7e0f541cad981f066 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 23:00:14 -0700 Subject: [PATCH 131/272] Update vision.py --- unsloth/models/vision.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 2c77169cb9..f8c0f866f9 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -646,7 +646,6 @@ def get_peft_model( lora_config = LoraConfig( **{ k : v for k, v in local_variables.items() if k in allowed_parameters }, ) - print(lora_config) model = prepare_model_for_kbit_training( model, use_gradient_checkpointing = use_gradient_checkpointing, From 400df38fb04aaec151c1d5b1e0d2a1ac23ceca6f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 16 Sep 2025 03:00:39 -0700 Subject: [PATCH 132/272] Fix DataParallel --- unsloth/models/llama.py | 7 +++++-- unsloth/models/rl.py | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index f7a53d05fd..e04ffd029e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1200,7 +1200,8 @@ def _CausalLM_fast_forward( if not RETURN_LOGITS and labels is not None: - n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None) + n_items = kwargs.get("num_items_in_batch", None) + if n_items is None: n_items = kwargs.get("n_items", None) if self.config.model_type == "falcon_h1": hidden_states = hidden_states * self.config.lm_head_multiplier @@ -1264,12 +1265,14 @@ def _CausalLM_fast_forward( shift_labels[..., :-1] = labels[..., 1:] shift_labels[..., -1] = -100 # shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]])) + n_items = kwargs.get("num_items_in_batch", None) + if n_items is None: n_items = kwargs.get("n_items", None) loss = fast_cross_entropy_loss( logits = shift_logits, labels = shift_labels, logit_softcapping = logit_softcapping, logit_scaling = logit_scaling, - n_items = 
kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None), + n_items = n_items, ) else: if logit_scaling != 0: diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 53f5eee66c..9e940c763b 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -110,6 +110,7 @@ def generate_with_clone(*args, **kwargs): from contextlib import nullcontext from torch.nn import functional as F from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode torch_compile_options = {{ "epilogue_fusion" : True, @@ -160,6 +161,11 @@ def __init__({RLTrainer_arguments}, ): if args is None: args = Unsloth{RLConfig_name}() {RLTrainer_extra_args} + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 super().__init__({RLTrainer_call_args}{RLTrainer_kwargs}) {RLTrainer_post} pass From 809a8b3b206db30c676852af07270db8c44b7319 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 16 Sep 2025 03:02:52 -0700 Subject: [PATCH 133/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a559d34ca4..194d18771c 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.9.5" +__version__ = "2025.9.6" __all__ = [ "SUPPORTS_BFLOAT16", From 3dcc0911eb5e5ae360456e281f3e9ca99c5f95b8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 02:13:07 -0700 Subject: [PATCH 134/272] Update rl.py --- unsloth/models/rl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 3e2fcf22be..6f1f000e68 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -271,14 +271,17 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): "if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')\n"\ "if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`')\n"\ "if force_float32:\n"\ + " # Forced float32 training\n"\ " args.fp16 = False\n"\ " args.bf16 = False\n"\ " os.environ['ACCELERATE_MIXED_PRECISION'] = 'no'\n"\ "elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32':\n"\ + " # Mixed precision training\n"\ " args.fp16 = float16\n"\ " args.bf16 = not float16\n"\ " os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16'\n" "elif mixed_precision_dtype == 'bfloat16':\n"\ + " # Both False since bfloat16 full finetuning doesn't do any autocasting.\n"\ " args.fp16 = False\n"\ " args.bf16 = False\n"\ " os.environ['ACCELERATE_MIXED_PRECISION'] = 'no'\n" From 28b1d50016921db9ada7bcdcdb67c61b92c9f379 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 02:40:22 -0700 Subject: [PATCH 135/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 52c114fab6..60742b7fdc 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ 
-28,6 +28,7 @@ patch_vllm, delete_vllm, ) +from unsloth_zoo.log import logger import numpy as np from .synthetic_configs import ( @@ -117,6 +118,7 @@ def __init__( else: subprocess_commands += ["--" + flag, which,] pass + logger.info(subprocess_commands) vllm_process = subprocess.Popen( subprocess_commands, stdout = subprocess.PIPE, From de162d3e2a724dd178d24961bd9b989a68b70f2d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 02:56:36 -0700 Subject: [PATCH 136/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 60742b7fdc..2cca155d6d 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -77,6 +77,7 @@ def __init__( return_args = True, enable_lora = False, use_bitsandbytes = False, + compilation_config = 3, **kwargs, ) if "dtype" in engine_args: From a507a7d82bb1792986ffaa99c9f10b4de7e6bba3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:03:32 -0700 Subject: [PATCH 137/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 2cca155d6d..d52f1df373 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -99,7 +99,7 @@ def __init__( if "model" in engine_args: del engine_args["model"] if "compilation_config" in engine_args: # Cannot parse in vllm serve - engine_args["compilation_config"] = 3 + engine_args["compilation_config"] = "'" + str(engine_args["compilation_config"]) + "'" subprocess_commands = [ "vllm", "serve", str(model_name), From cda72638c333e653d1ac74df30a69b6abfbf3624 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:06:04 -0700 Subject: [PATCH 138/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index d52f1df373..68dd475e59 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -99,7 +99,7 @@ def __init__( if "model" in engine_args: del engine_args["model"] if "compilation_config" in engine_args: # Cannot parse in vllm serve - engine_args["compilation_config"] = "'" + str(engine_args["compilation_config"]) + "'" + engine_args["compilation_config"] = '"' + str(engine_args["compilation_config"]) + '"' subprocess_commands = [ "vllm", "serve", str(model_name), From dd8ad929e13235091c0379a03a2f09ac3a5c61a1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:07:03 -0700 Subject: [PATCH 139/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 68dd475e59..53d655ce0e 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -97,15 +97,15 @@ def __init__( engine_args["dtype"] = "auto" if "device" in engine_args: del engine_args["device"] if "model" in engine_args: del engine_args["model"] - if "compilation_config" in engine_args: - # Cannot parse in vllm serve - engine_args["compilation_config"] = '"' + str(engine_args["compilation_config"]) + '"' subprocess_commands = [ "vllm", "serve", str(model_name), ] for key, value in engine_args.items(): flag = key.replace("_", "-") + if key == "compilation_config": + subprocess_commands += ["--" + '"' + str(value) + '"',] + continue which = str(value).replace("torch.", "") if which == "True": # Ignore --enforce-eager True From a725b98363e50b7c80649e83975c1f9017f01eed Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:24:07 -0700 Subject: [PATCH 140/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py 
index 53d655ce0e..7c421b33bf 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -104,7 +104,7 @@ def __init__( for key, value in engine_args.items(): flag = key.replace("_", "-") if key == "compilation_config": - subprocess_commands += ["--" + '"' + str(value) + '"',] + subprocess_commands += ["--" + flag, '"' + str(value) + '"',] continue which = str(value).replace("torch.", "") if which == "True": From 321f1a33b0e243691b8e297ac0170393d51456ff Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:26:57 -0700 Subject: [PATCH 141/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 7c421b33bf..7e27b8261d 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -104,7 +104,7 @@ def __init__( for key, value in engine_args.items(): flag = key.replace("_", "-") if key == "compilation_config": - subprocess_commands += ["--" + flag, '"' + str(value) + '"',] + subprocess_commands += ["--" + flag, "'" + str(value) + "'",] continue which = str(value).replace("torch.", "") if which == "True": From 357e5019b7341c9b19f62db146950113e4aa58b9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:30:11 -0700 Subject: [PATCH 142/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 7e27b8261d..aa5296c58b 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -104,7 +104,8 @@ def __init__( for key, value in engine_args.items(): flag = key.replace("_", "-") if key == "compilation_config": - subprocess_commands += ["--" + flag, "'" + str(value) + "'",] + quoted_compilation_config = '"' + str(value) + '"' + subprocess_commands += ["--" + flag, "'" + quoted_compilation_config[1:-1] + "'",] continue 
which = str(value).replace("torch.", "") if which == "True": From 8a03656b958d023c4e2639ef3cf7d6c0616f4efb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:32:54 -0700 Subject: [PATCH 143/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index aa5296c58b..eb73a5fb84 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -121,6 +121,7 @@ def __init__( subprocess_commands += ["--" + flag, which,] pass logger.info(subprocess_commands) + print(subprocess_commands) vllm_process = subprocess.Popen( subprocess_commands, stdout = subprocess.PIPE, From d7832d01baaef9a791c509d69c122c61385425f2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:42:00 -0700 Subject: [PATCH 144/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index eb73a5fb84..70f94e5584 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -102,10 +102,9 @@ def __init__( "vllm", "serve", str(model_name), ] for key, value in engine_args.items(): - flag = key.replace("_", "-") + flag = key.replace("_", "-") if key == "compilation_config": - quoted_compilation_config = '"' + str(value) + '"' - subprocess_commands += ["--" + flag, "'" + quoted_compilation_config[1:-1] + "'",] + subprocess_commands += ["--" + flag, str(value),] continue which = str(value).replace("torch.", "") if which == "True": @@ -121,7 +120,6 @@ def __init__( subprocess_commands += ["--" + flag, which,] pass logger.info(subprocess_commands) - print(subprocess_commands) vllm_process = subprocess.Popen( subprocess_commands, stdout = subprocess.PIPE, From 84f54348de880229dd67afbb737ea247839a6afa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 04:09:59 -0700 Subject: [PATCH 145/272] 
Update synthetic.py --- unsloth/dataprep/synthetic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 70f94e5584..b75918237b 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -104,7 +104,8 @@ def __init__( for key, value in engine_args.items(): flag = key.replace("_", "-") if key == "compilation_config": - subprocess_commands += ["--" + flag, str(value),] + # [TODO] Unsure why subprocess doesn't process json properly + subprocess_commands += ["-O3",] continue which = str(value).replace("torch.", "") if which == "True": From 17b2e98f3df7735166a6c3f8b4ba2689418bc6e3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 04:16:41 -0700 Subject: [PATCH 146/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index b75918237b..9651df23e8 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -105,7 +105,8 @@ def __init__( flag = key.replace("_", "-") if key == "compilation_config": # [TODO] Unsure why subprocess doesn't process json properly - subprocess_commands += ["-O3",] + # Also -O3 breaks on T4! 
+ # subprocess_commands += ["-O3",] continue which = str(value).replace("torch.", "") if which == "True": From 5364138046cdddedc37594ae87f5e51bb0265031 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 06:35:44 -0700 Subject: [PATCH 147/272] Update mapper.py --- unsloth/models/mapper.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index be269316fe..eb9119b681 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -956,6 +956,16 @@ "google/gemma-3-270m", "unsloth/gemma-3-270m-bnb-4bit", ), + "unsloth/Magistral-Small-2507-unsloth-bnb-4bit" : ( + "unsloth/Magistral-Small-2507", + "mistralai/Magistral-Small-2507", + "unsloth/Magistral-Small-2507-bnb-4bit", + ), + "unsloth/Magistral-Small-2509-unsloth-bnb-4bit" : ( + "unsloth/Magistral-Small-2509", + "mistralai/Magistral-Small-2509", + "unsloth/Magistral-Small-2509-bnb-4bit", + ), } INT_TO_FLOAT_MAPPER = {} From 8dbd0084d4097cf3c5eb03027ecdf5ec5bdacc17 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 08:21:10 -0700 Subject: [PATCH 148/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 70fc3bdedc..c3915c1cd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.7", + "unsloth_zoo>=2025.9.8", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.7", + "unsloth_zoo>=2025.9.8", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 41adc74650..d2ebc29bf2 100644 --- a/unsloth/models/_utils.py +++ 
b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.6" +__version__ = "2025.9.7" __all__ = [ "SUPPORTS_BFLOAT16", From d7ca79f18ef5b794b3684768708ab7ebb57a4acc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 22:01:19 -0700 Subject: [PATCH 149/272] Update loader.py --- unsloth/models/loader.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index da40fb57d8..e891340221 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -83,8 +83,8 @@ global FORCE_FLOAT32 FORCE_FLOAT32 = [ - "gemma3,", # Add comma bc gemma3 will match gemma3n - "gemma3n", + "gemma3,", + "gemma3n,", "gpt_oss", ] @@ -627,7 +627,7 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) - model_types_all = ",".join(model_types) + model_types_all = ",".join(model_types) + "," # Check versions lowered_model_name = model_name.lower() @@ -642,21 +642,22 @@ def from_pretrained( elif "qwen2_5" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." 
+ LATEST) # Gemma 3 - elif "gemma3" in model_types_all: - if "gemma3n" in model_types_all: - if transformers_version < Version("4.53.0"): - raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) - os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" - os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "float16;torch.float16;torch.float16;"\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ - ";"\ - "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" - else: - if transformers_version < Version("4.50.0.dev0"): - raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) - + elif "gemma3," in model_types_all: + if transformers_version < Version("4.50.0.dev0"): + raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) + # Set norms to float32 since anyways they get upcasted to float32 + # common in both gemma-3 and gemma-3n + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" + elif "gemma3n," in model_types_all: + if transformers_version < Version("4.53.0"): + raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) + os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" + os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ + "float16;torch.float16;torch.float16;"\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ + ";"\ + "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" # Set norms to float32 since anyways they get upcasted to float32 # common in both gemma-3 and gemma-3n os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" @@ -811,7 +812,7 @@ def from_pretrained( for disable_name in FORCE_FLOAT32: # add comma to model_types_all matching in case of exact match for end if (disable_name.lower() == model_type_arch.lower().replace("-", "").replace("_", 
"") or \ - disable_name.lower() in f'{model_types_all},') and \ + disable_name.lower() in model_types_all) and \ ((dtype == torch.float16) or not SUPPORTS_BFLOAT16): os.environ["UNSLOTH_FORCE_FLOAT32"] = "1" dtype = torch.bfloat16 # Change to bfloat16 loading From bb90785ad3066b4ba926cf1e607f120128c32982 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 22:10:29 -0700 Subject: [PATCH 150/272] Update loader.py --- unsloth/models/loader.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index e891340221..da40fb57d8 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -83,8 +83,8 @@ global FORCE_FLOAT32 FORCE_FLOAT32 = [ - "gemma3,", - "gemma3n,", + "gemma3,", # Add comma bc gemma3 will match gemma3n + "gemma3n", "gpt_oss", ] @@ -627,7 +627,7 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) - model_types_all = ",".join(model_types) + "," + model_types_all = ",".join(model_types) # Check versions lowered_model_name = model_name.lower() @@ -642,22 +642,21 @@ def from_pretrained( elif "qwen2_5" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST) # Gemma 3 - elif "gemma3," in model_types_all: - if transformers_version < Version("4.50.0.dev0"): - raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." 
+ NIGHTLY) - # Set norms to float32 since anyways they get upcasted to float32 - # common in both gemma-3 and gemma-3n - os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" - elif "gemma3n," in model_types_all: - if transformers_version < Version("4.53.0"): - raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) - os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" - os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "float16;torch.float16;torch.float16;"\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ - ";"\ - "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" + elif "gemma3" in model_types_all: + if "gemma3n" in model_types_all: + if transformers_version < Version("4.53.0"): + raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) + os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" + os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ + "float16;torch.float16;torch.float16;"\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ + ";"\ + "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" + else: + if transformers_version < Version("4.50.0.dev0"): + raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." 
+ NIGHTLY) + # Set norms to float32 since anyways they get upcasted to float32 # common in both gemma-3 and gemma-3n os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" @@ -812,7 +811,7 @@ def from_pretrained( for disable_name in FORCE_FLOAT32: # add comma to model_types_all matching in case of exact match for end if (disable_name.lower() == model_type_arch.lower().replace("-", "").replace("_", "") or \ - disable_name.lower() in model_types_all) and \ + disable_name.lower() in f'{model_types_all},') and \ ((dtype == torch.float16) or not SUPPORTS_BFLOAT16): os.environ["UNSLOTH_FORCE_FLOAT32"] = "1" dtype = torch.bfloat16 # Change to bfloat16 loading From 3289826add711c92dee44f1117fa6a54d6e68b91 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Sep 2025 02:22:14 -0700 Subject: [PATCH 151/272] Update rl.py --- unsloth/models/rl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 6f1f000e68..3d5f6d084b 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -259,7 +259,8 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): "use_fp16 = getattr(args, 'fp16', False)\n"\ "if type(use_fp16) is not bool: use_fp16 = False\n"\ "force_float32 = False\n"\ - "if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1':\n"\ + "full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1'\n"\ + "if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'):\n"\ " print('Unsloth: Switching to float32 training since model cannot work with float16')\n"\ " force_float32 = True\n"\ "mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32')\n"\ From a04211436f8a11aaece59d4662a29ab4c825a0b1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Sep 2025 04:31:28 -0700 Subject: [PATCH 152/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml 
b/pyproject.toml index c3915c1cd6..4f9c308b32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.8", + "unsloth_zoo>=2025.9.9", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.8", + "unsloth_zoo>=2025.9.9", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 79134005dc..5f41352d97 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.7" +__version__ = "2025.9.8" __all__ = [ "SUPPORTS_BFLOAT16", From ffa04dde12b7fa9430566cce8b0309531f7af2ba Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Sep 2025 15:45:42 -0700 Subject: [PATCH 153/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5f41352d97..79134005dc 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.9.8" +__version__ = "2025.9.7" __all__ = [ "SUPPORTS_BFLOAT16", From b3654449bdd237e642e5f44c6e96c74e203232f7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Sep 2025 18:57:41 -0700 Subject: [PATCH 154/272] Fix auto_mapping --- unsloth/models/llama.py | 4 +++- unsloth/models/vision.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 7414c07326..6326f519f1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -25,7 +25,7 @@ from torch.nn.functional import scaled_dot_product_attention from transformers import __version__ as transformers_version from unsloth_zoo.utils import Version, _get_dtype -from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs +from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs, fix_lora_auto_mapping from unsloth_zoo.peft_utils import SKIP_QUANTIZATION_MODULES from unsloth import DEVICE_TYPE, DEVICE_COUNT @@ -2632,6 +2632,8 @@ def get_peft_model( pass model = _get_peft_model(model, lora_config) + # Fix LoraConfig.auto_mapping is None + fix_lora_auto_mapping(model) # Apply QAT + LoRA if specified if qat_scheme is not None: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index d6c710c281..d03ffb45a9 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -43,7 +43,7 @@ from transformers import __version__ as transformers_version from triton import __version__ as triton_version from unsloth_zoo.utils import _get_dtype -from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs +from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs, fix_lora_auto_mapping from unsloth_zoo.patching_utils import patch_model_and_tokenizer from unsloth_zoo.training_utils import prepare_model_for_training @@ -758,6 +758,8 @@ def get_peft_model( use_gradient_checkpointing = use_gradient_checkpointing, ) model = _get_peft_model(model, lora_config) + # Fix 
LoraConfig.auto_mapping is None + fix_lora_auto_mapping(model) # Enable gradients on modules which are trainable requires_grad_for_gradient_checkpointing(model) trust_remote_code = getattr(model, "_unsloth_trust_remote_code", False) From 5ce7bf895269e15d56b2de088993e829285c0805 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 02:09:50 -0700 Subject: [PATCH 155/272] Update loader.py --- unsloth/models/loader.py | 126 ++++++++++++++++++++++----------------- 1 file changed, 70 insertions(+), 56 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3eb80fc0dd..ce90874f94 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -82,12 +82,28 @@ ) global FORCE_FLOAT32 +# Forces float32 precision since float16 goes to infinity FORCE_FLOAT32 = [ - "gemma3,", # Add comma bc gemma3 will match gemma3n + "gemma3,", # Add comma bc gemma3 will match gemma3n "gemma3n", "gpt_oss", ] +global DISABLE_COMPILE_MODEL_NAMES +# Must be alphabetically sorted for each entry +DISABLE_COMPILE_MODEL_NAMES = [ + "aya_vision", + "modernbert", + "granite,llava_next", # Granite-vision 3 +] + +global DISABLE_SDPA_MODEL_NAMES +# Disables some SDPA modules since it's wrong +DISABLE_SDPA_MODEL_NAMES = [ + "gemma3,", # Add comma bc gemma3 will match gemma3n +] + + class FastLanguageModel(FastLlamaModel): @staticmethod def from_pretrained( @@ -213,16 +229,27 @@ def from_pretrained( peft_error = str(error) is_peft = False pass - model_types = get_transformers_model_type(peft_config or model_config) + + # Old transformers versions check + both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 + + # Error out if both LoRA and normal model config exists. + if both_exist: + raise RuntimeError( + "Unsloth: Your repo has a LoRA adapter and a base model.\n"\ + "You have 2 files `config.json` and `adapter_config.json`.\n"\ + "We must only allow one config file.\n"\ + "Please separate the LoRA and base models to 2 repos." 
+ ) + model_types = get_transformers_model_type( + peft_config if peft_config is not None else model_config + ) if len(model_types) == 1: model_type = model_types[0] else: # Leave as tuple if more than one arch model_type = model_types - # Old transformers versions check - both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 - # New transformers need to check manually. if SUPPORTS_LLAMA32: # Check if folder exists locally @@ -240,17 +267,8 @@ def from_pretrained( pass pass - # Error out if both LoRA and normal model config exists. - if both_exist: - raise RuntimeError( - "Unsloth: Your repo has a LoRA adapter and a base model.\n"\ - "You have 2 files `config.json` and `adapter_config.json`.\n"\ - "We must only allow one config file.\n"\ - "Please separate the LoRA and base models to 2 repos." - ) - - elif not is_model and not is_peft: - error = autoconfig_error or peft_error + if not is_model and not is_peft: + error = autoconfig_error if autoconfig_error is not None else peft_error # Old transformers version if "rope_scaling" in error.lower() and not SUPPORTS_LLAMA31: raise ImportError( @@ -498,13 +516,6 @@ def from_pretrained( from transformers import AutoModelForVision2Seq pass -# Must be alphabetically sorted for each entry -DISABLE_COMPILE_MODEL_NAMES = [ - "aya_vision", - "modernbert", - "granite,llava_next", # Granite-vision 3 -] - class FastModel(FastBaseModel): @staticmethod @@ -626,8 +637,20 @@ def from_pretrained( peft_error = str(error) is_peft = False pass - model_types = get_transformers_model_type(peft_config or model_config) - model_types_all = ",".join(model_types) + # Old transformers versions check + both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 + # Error out if both LoRA and normal model config exists. 
+ if both_exist: + raise RuntimeError( + "Unsloth: Your repo has a LoRA adapter and a base model.\n"\ + "You have 2 files `config.json` and `adapter_config.json`.\n"\ + "We must only allow one config file.\n"\ + "Please separate the LoRA and base models to 2 repos." + ) + model_types = get_transformers_model_type( + peft_config if peft_config is not None else model_config + ) + model_types_all = ",".join(model_types) + "," # Check versions lowered_model_name = model_name.lower() @@ -643,20 +666,22 @@ def from_pretrained( raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST) # Gemma 3 elif "gemma3" in model_types_all: - if "gemma3n" in model_types_all: - if transformers_version < Version("4.53.0"): - raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) - os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" - os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "float16;torch.float16;torch.float16;"\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ - ";"\ - "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" - else: - if transformers_version < Version("4.50.0.dev0"): - raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) - + if transformers_version < Version("4.50.0.dev0"): + raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." 
+ NIGHTLY) + # Set norms to float32 since anyways they get upcasted to float32 + # common in both gemma-3 and gemma-3n + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" + # Gemma 3N + elif "gemma3n" in model_types_all: + if transformers_version < Version("4.53.0"): + raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) + os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" + os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ + "float16;torch.float16;torch.float16;"\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ + ";"\ + "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" # Set norms to float32 since anyways they get upcasted to float32 # common in both gemma-3 and gemma-3n os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" @@ -732,9 +757,6 @@ def from_pretrained( os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" pass - # Old transformers versions check - both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 - # New transformers need to check manually. if SUPPORTS_LLAMA32: # Check if folder exists locally @@ -751,17 +773,8 @@ def from_pretrained( pass pass - # Error out if both LoRA and normal model config exists. - if both_exist: - raise RuntimeError( - "Unsloth: Your repo has a LoRA adapter and a base model.\n"\ - "You have 2 files `config.json` and `adapter_config.json`.\n"\ - "We must only allow one config file.\n"\ - "Please separate the LoRA and base models to 2 repos." 
- ) - - elif not is_model and not is_peft: - error = autoconfig_error or peft_error + if not is_model and not is_peft: + error = autoconfig_error if autoconfig_error is not None else peft_error # Old transformers version if "rope_scaling" in error.lower() and not SUPPORTS_LLAMA31: raise ImportError( @@ -811,7 +824,7 @@ def from_pretrained( for disable_name in FORCE_FLOAT32: # add comma to model_types_all matching in case of exact match for end if (disable_name.lower() == model_type_arch.lower().replace("-", "").replace("_", "") or \ - disable_name.lower() in f'{model_types_all},') and \ + disable_name.lower() in model_types_all) and \ ((dtype == torch.float16) or not SUPPORTS_BFLOAT16): os.environ["UNSLOTH_FORCE_FLOAT32"] = "1" dtype = torch.bfloat16 # Change to bfloat16 loading @@ -855,8 +868,9 @@ def from_pretrained( unsloth_force_compile = unsloth_force_compile, ) pass - # Fix SDPA - if "gemma3n" in model_types_all: + # Fix SDPA issues + for model_type in DISABLE_SDPA_MODEL_NAMES: + if model_type in model_types_all: supports_sdpa = False pass From 755e6e2026cc120f876d489a6daaf512255f843d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 02:12:12 -0700 Subject: [PATCH 156/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ce90874f94..a400cb621a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -871,7 +871,7 @@ def from_pretrained( # Fix SDPA issues for model_type in DISABLE_SDPA_MODEL_NAMES: if model_type in model_types_all: - supports_sdpa = False + supports_sdpa = False pass # Check if this is local model since the tokenizer gets overwritten From d01b8af06d3b56ee1cc44934b620e0861c7d666d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 16:54:25 -0700 Subject: [PATCH 157/272] Update vision.py --- unsloth/models/vision.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/unsloth/models/vision.py b/unsloth/models/vision.py index 3f5ae816ea..4e9b88a205 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -242,6 +242,8 @@ def unsloth_base_fast_generate( kwargs["compile_config"] = _compile_config pass + kwargs["cache_implementation"] = "static" + with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From d048d3aedaaa5d5d62bdfd679f8c42ffd295ce27 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 18:01:46 -0700 Subject: [PATCH 158/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 4e9b88a205..934f2bbc5e 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -422,7 +422,7 @@ def from_pretrained( # Stop SDPA for some archs like Pixtral / Mistral3 if not ("attn_implementation" in kwargs): kwargs["attn_implementation"] = "sdpa" - if not supports_sdpa: + if not supports_sdpa and (os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "0") == "0"): print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to fast eager.") del kwargs["attn_implementation"] pass From 81ba78e9a11500de05596dd0b28979abe3b14050 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 18:29:35 -0700 Subject: [PATCH 159/272] Update loader.py --- unsloth/models/loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a400cb621a..baeedd339d 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -868,6 +868,7 @@ def from_pretrained( unsloth_force_compile = unsloth_force_compile, ) pass + print("supports_sdpa", supports_sdpa) # Fix SDPA issues for model_type in DISABLE_SDPA_MODEL_NAMES: if model_type in model_types_all: From 0bb74fe32ae1f0790142f88eadefbaba36a94742 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 18:31:36 -0700 Subject: [PATCH 160/272] 
Message --- unsloth/models/loader.py | 1 - unsloth/models/vision.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index baeedd339d..a400cb621a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -868,7 +868,6 @@ def from_pretrained( unsloth_force_compile = unsloth_force_compile, ) pass - print("supports_sdpa", supports_sdpa) # Fix SDPA issues for model_type in DISABLE_SDPA_MODEL_NAMES: if model_type in model_types_all: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 934f2bbc5e..9e954894a4 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -422,8 +422,9 @@ def from_pretrained( # Stop SDPA for some archs like Pixtral / Mistral3 if not ("attn_implementation" in kwargs): kwargs["attn_implementation"] = "sdpa" - if not supports_sdpa and (os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "0") == "0"): - print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to fast eager.") + if not supports_sdpa: + if os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "0") == "0": + print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to fast eager.") del kwargs["attn_implementation"] pass From 14fdb224e61a24e22660f54033ace71fc62c0163 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 19:12:58 -0700 Subject: [PATCH 161/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 9e954894a4..d716cdf9a1 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -242,7 +242,7 @@ def unsloth_base_fast_generate( kwargs["compile_config"] = _compile_config pass - kwargs["cache_implementation"] = "static" + print(kwargs["cache_implementation"], args, kwargs) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From 
ce4f2b6cb2f8b1a060cdebddfeee3bffaa1291f7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 19:18:05 -0700 Subject: [PATCH 162/272] Update loader.py --- unsloth/models/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a400cb621a..c25fd86966 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -713,7 +713,6 @@ def from_pretrained( ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" elif "gpt_oss" in model_types_all: - os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB # Set norms to float32 since anyways they get upcasted to float32 From e333b03f0645a5c1c3ee33e0319f80f0259ccddc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 19:21:20 -0700 Subject: [PATCH 163/272] Update vision.py --- unsloth/models/vision.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index d716cdf9a1..f257bf4ec3 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -242,8 +242,6 @@ def unsloth_base_fast_generate( kwargs["compile_config"] = _compile_config pass - print(kwargs["cache_implementation"], args, kwargs) - with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From 456d225e1a9f1297bc0067840cb8a8b975712e72 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 19:23:03 -0700 Subject: [PATCH 164/272] cache_implementation --- unsloth/models/loader.py | 1 + unsloth/models/vision.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index c25fd86966..a400cb621a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -713,6 +713,7 @@ def from_pretrained( ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" elif "gpt_oss" in model_types_all: + os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" if not 
load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB # Set norms to float32 since anyways they get upcasted to float32 diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index f257bf4ec3..e3b87c3dde 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -217,8 +217,11 @@ def unsloth_base_fast_generate( if getattr(self, "_supports_static_cache", getattr(self, "_can_compile_fullgraph", True)): if os.environ.get("UNSLOTH_DISABLE_STATIC_GENERATION", "0") == "0": cache_implementation = "static" - else: + elif Version(transformers_version) < Version("4.56.0.dev0"): cache_implementation = None + else: + # Should work in latest transformers! + cache_implementation = "static" else: cache_implementation = None if cache_implementation is not None: @@ -241,7 +244,7 @@ def unsloth_base_fast_generate( if cache_implementation is not None: kwargs["compile_config"] = _compile_config pass - + print(cache_implementation) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From 1cd7b85b07f03071936d3e336d17540cc445b987 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 19:37:23 -0700 Subject: [PATCH 165/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e3b87c3dde..f0ae0e39a8 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -233,7 +233,7 @@ def unsloth_base_fast_generate( if Version(transformers_version) < Version("4.56.0.dev0"): cache_implementation = "hybrid" else: - cache_implementation = "static" + cache_implementation = None if "generation_config" in kwargs: kwargs["generation_config"].cache_implementation = cache_implementation From 2b0d2195a3c70b375c85a3106aba01215ee5e082 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 21:35:13 -0700 Subject: [PATCH 166/272] Update loader.py --- unsloth/models/loader.py | 16 ++++++++-------- 1 file 
changed, 8 insertions(+), 8 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a400cb621a..a445dd72f1 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -664,14 +664,7 @@ def from_pretrained( # Qwen 2.5 elif "qwen2_5" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST) - # Gemma 3 - elif "gemma3" in model_types_all: - if transformers_version < Version("4.50.0.dev0"): - raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) - # Set norms to float32 since anyways they get upcasted to float32 - # common in both gemma-3 and gemma-3n - os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" - # Gemma 3N + # Gemma 3N must be before Gemma 3 elif "gemma3n" in model_types_all: if transformers_version < Version("4.53.0"): raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) @@ -685,6 +678,13 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 # common in both gemma-3 and gemma-3n os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" + # Gemma 3 + elif "gemma3" in model_types_all: + if transformers_version < Version("4.50.0.dev0"): + raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) + # Set norms to float32 since anyways they get upcasted to float32 + # common in both gemma-3 and gemma-3n + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" # Cohere elif "cohere2" in model_types_all and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0."
+ NIGHTLY) From d1c92839451f0d11673d5814460089f99ef1eea9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 21:39:37 -0700 Subject: [PATCH 167/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index f0ae0e39a8..e3b87c3dde 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -233,7 +233,7 @@ def unsloth_base_fast_generate( if Version(transformers_version) < Version("4.56.0.dev0"): cache_implementation = "hybrid" else: - cache_implementation = None + cache_implementation = "static" if "generation_config" in kwargs: kwargs["generation_config"].cache_implementation = cache_implementation From a0df6ab7090090f907cdb763468f6ec0cc373c75 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 22:19:47 -0700 Subject: [PATCH 168/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e3b87c3dde..d5076f7762 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -244,6 +244,7 @@ def unsloth_base_fast_generate( if cache_implementation is not None: kwargs["compile_config"] = _compile_config pass + kwargs["cache_implementation"] = None print(cache_implementation) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From 450b2da52f45722b62c4084699f5102869577193 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 22:23:10 -0700 Subject: [PATCH 169/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index d5076f7762..13b078378d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -244,7 +244,7 @@ def unsloth_base_fast_generate( if cache_implementation is not None: kwargs["compile_config"] = _compile_config pass - 
kwargs["cache_implementation"] = None + # kwargs["cache_implementation"] = None print(cache_implementation) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From b1116d59a22dc2a09e19d1ced2e48254de2dc742 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 00:52:40 -0700 Subject: [PATCH 170/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a445dd72f1..1a9c145368 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -690,7 +690,7 @@ def from_pretrained( raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0." + NIGHTLY) # Sesame elif "csm" in model_types_all: - os.environ["UNSLOTH_COMPILE_DISABLE"] = "1" # Inference is too slow + os.environ["UNSLOTH_COMPILE_DISABLE"] = "partial" # Inference is too slow os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # Sesame fails os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;torch.float32;torch.float16;"\ @@ -745,7 +745,7 @@ def from_pretrained( else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: if check_model_name in lowered_model_name: - os.environ["UNSLOTH_COMPILE_DISABLE"] = "1" + os.environ["UNSLOTH_COMPILE_DISABLE"] = "partial" os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" if transformers_version < Version("4.50.0.dev0"): raise RuntimeError(f"Unsloth: {check_model_name} only works on transformers >= 4.50.0." 
+ NIGHTLY) From 7210cb1d5b53601efde268751f788825e3302e74 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 03:30:11 -0700 Subject: [PATCH 171/272] Update vision.py --- unsloth/models/vision.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 13b078378d..2ea4d1f71f 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -244,8 +244,6 @@ def unsloth_base_fast_generate( if cache_implementation is not None: kwargs["compile_config"] = _compile_config pass - # kwargs["cache_implementation"] = None - print(cache_implementation) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From f148170a3cc007d2cf3f28c76cb1c9797c828bcd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 03:54:20 -0700 Subject: [PATCH 172/272] Save max_seq_length --- unsloth/models/llama.py | 6 ++++++ unsloth/models/vision.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6326f519f1..fee83441fe 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2170,6 +2170,9 @@ def from_pretrained( m = m.model pass m.max_seq_length = max_seq_length + # Save to modules as well + for module in model.modules(): + module.max_seq_length = max_seq_length # We check the tokenizer first for errors if fix_tokenizer: @@ -2892,6 +2895,9 @@ def patch_peft_model( internal_model = internal_model.model pass internal_model.max_seq_length = max_seq_length + # Save to modules as well + for module in model.modules(): + module.max_seq_length = max_seq_length # Patch tokenizer to pad to the right internal_model = model diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 2ea4d1f71f..6d8250318d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -645,6 +645,9 @@ def from_pretrained( m = m.model pass m.max_seq_length = max_seq_length + # Save to modules as well + for module in 
model.modules(): + module.max_seq_length = max_seq_length m._saved_temp_tokenizer = tokenizer # Also set is_loaded_in_8bit to disable incorrect DDP m.is_loaded_in_8bit = True if not full_finetuning else False @@ -780,6 +783,9 @@ def get_peft_model( trust_remote_code = getattr(model, "_unsloth_trust_remote_code", False) model = FastBaseModel.post_patch_model(model, use_gradient_checkpointing, trust_remote_code = trust_remote_code) model.max_seq_length = max_seq_length + # Save to modules as well + for module in model.modules(): + module.max_seq_length = max_seq_length # Clear deleted GPU items for _ in range(3): gc.collect() From 7fa66da748deb9144122107f083a79a4c8f97c18 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 03:56:12 -0700 Subject: [PATCH 173/272] Update _utils.py --- unsloth/models/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 60abcea702..8275283c6a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -137,6 +137,7 @@ # ============================================= # Disable some warnings which can get annoying warnings.filterwarnings(action = "ignore", category = UserWarning, module = "torch") +warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "torch") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "huggingface_hub") warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "huggingface_hub") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "trl") From 0b49db1a0b2d13789da43a9f3326063093cdec54 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 18:01:21 -0700 Subject: [PATCH 174/272] Update rl.py --- unsloth/models/rl.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 3d5f6d084b..889cbaccaf 100644 --- a/unsloth/models/rl.py +++ 
b/unsloth/models/rl.py @@ -116,6 +116,22 @@ def generate_with_clone(*args, **kwargs): from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling from transformers.training_args import ParallelMode +# Wrap trainer with padding to right and enable training mode +import functools +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, model) and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, model) and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + torch_compile_options = {{ "epilogue_fusion" : True, "max_autotune" : False, @@ -174,7 +190,11 @@ def __init__({RLTrainer_arguments}, if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: if getattr(args, "_n_gpu", 1) != 1: args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() super().__init__({RLTrainer_call_args}{RLTrainer_kwargs}) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() {RLTrainer_post} pass ''' @@ -460,7 +480,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): # Add accelerator scaler to model if "model" in call_args: - neftune_check = \ + accelerator_check = \ "if hasattr(self, 'accelerator'):\n"\ " scaler = self.accelerator.scaler\n"\ " current_model = model\n"\ @@ -469,7 +489,16 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): " current_model = current_model.model\n"\ " current_model.accelerator_scaler = scaler\n"\ "pass\n" - RLTrainer_post += neftune_check + RLTrainer_post += accelerator_check + pass + + # Add enabling and disabling training modes + if "model" in call_args: + training_check = \ + "if hasattr(self, 'train'):\n"\ + " self.train = 
prepare_for_training_mode(self.train)\n"\ + "pass\n" + RLTrainer_post += training_check pass # Edit optional metrics From f1c47f860c7f9a7dc34bd5eba27c61daf67af38b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 18:10:27 -0700 Subject: [PATCH 175/272] Update vision.py --- unsloth/models/vision.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 6d8250318d..36e2cdd459 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -247,7 +247,7 @@ def unsloth_base_fast_generate( with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) - FastBaseModel.for_training(self) + # FastBaseModel.for_training(self) return output pass @@ -576,7 +576,7 @@ def from_pretrained( if (whisper_language and whisper_task) or auto_model.__name__.endswith("ForConditionalGeneration"): tokenizer = auto_processor.from_pretrained( tokenizer_name, - padding_side = "right", + padding_side = "left", token = token, language = whisper_language, task = whisper_task, @@ -585,19 +585,19 @@ def from_pretrained( try: tokenizer = auto_processor.from_pretrained( tokenizer_name, - padding_side = "right", + padding_side = "left", token = token, ) except: tokenizer = get_auto_processor( tokenizer_name, - padding_side = "right", + padding_side = "left", token = token, ) if hasattr(tokenizer, "tokenizer"): __tokenizer = tokenizer.tokenizer # Add padding side as well - __tokenizer.padding_side = "right" + __tokenizer.padding_side = "left" # Check bos, eos, pad tokens if hasattr(__tokenizer, "bos_token"): tokenizer.bos_token = __tokenizer.bos_token @@ -800,6 +800,11 @@ def get_peft_model( # Add for_inference and for_training model.for_training = functools.partial(FastBaseModel.for_training, model) model.for_inference = functools.partial(FastBaseModel.for_inference, model) + m = model + while hasattr(m, "model"): + m.for_training = 
functools.partial(FastBaseModel.for_training, m) + m.for_inference = functools.partial(FastBaseModel.for_inference, m) + m = m.model return model pass @@ -835,12 +840,12 @@ def post_patch_model( pass patch_saving_functions(model, vision = True) - # Patch tokenizer to pad to the right + # Patch tokenizer to pad to the left m = model while hasattr(m, "model"): if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): - m._saved_temp_tokenizer.tokenizer.padding_side = "right" + m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP m.is_loaded_in_8bit = True if not full_finetuning else False @@ -848,7 +853,7 @@ def post_patch_model( pass if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): - m._saved_temp_tokenizer.tokenizer.padding_side = "right" + m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP m.is_loaded_in_8bit = True if not full_finetuning else False @@ -864,6 +869,11 @@ def post_patch_model( # Add for_inference and for_training model.for_training = functools.partial(FastBaseModel.for_training, model) model.for_inference = functools.partial(FastBaseModel.for_inference, model) + m = model + while hasattr(m, "model"): + m.for_training = functools.partial(FastBaseModel.for_training, m) + m.for_inference = functools.partial(FastBaseModel.for_inference, m) + m = m.model return model pass From 27f62038eda9b01c3022d2dcd25ac268bb1a030a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 18:15:39 -0700 Subject: [PATCH 176/272] Update llama.py --- unsloth/models/llama.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index fee83441fe..8708fb5218 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2231,6 +2231,11 @@ def from_pretrained( # Add for_inference and for_training 
model.for_training = functools.partial(FastLlamaModel.for_training, model) model.for_inference = functools.partial(FastLlamaModel.for_inference, model) + m = model + while hasattr(m, "model"): + m.for_training = functools.partial(FastBaseModel.for_training, m) + m.for_inference = functools.partial(FastBaseModel.for_inference, m) + m = m.model # Patch generate is_classification = "Classification" in str(type(model)) @@ -2707,6 +2712,11 @@ def get_peft_model( # Add for_inference and for_training model.for_training = functools.partial(FastLlamaModel.for_training, model) model.for_inference = functools.partial(FastLlamaModel.for_inference, model) + m = model + while hasattr(m, "model"): + m.for_training = functools.partial(FastBaseModel.for_training, m) + m.for_inference = functools.partial(FastBaseModel.for_inference, m) + m = m.model return model pass @@ -2922,6 +2932,11 @@ def patch_peft_model( # Add for_inference and for_training model.for_training = functools.partial(FastLlamaModel.for_training, model) model.for_inference = functools.partial(FastLlamaModel.for_inference, model) + m = model + while hasattr(m, "model"): + m.for_training = functools.partial(FastBaseModel.for_training, m) + m.for_inference = functools.partial(FastBaseModel.for_inference, m) + m = m.model return model pass From f06179fa85d7810a5c82de1212ca1a3706a71f9e Mon Sep 17 00:00:00 2001 From: Datta Nimmaturi Date: Mon, 22 Sep 2025 10:50:03 +0530 Subject: [PATCH 177/272] Mistral3 vllm (#3349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [WIP] use vLLM for vision language models * Update README.md Editing icon sizes * Update README.md Updating icon sizes * Update README.md (#2885) * MoE kernels AGPLv3 * versioning * Many bug fixes (#2908) * add deepseek v3 * add deepseek r1 base * add deepseek r1 zero * add deepseek distill llama * add deepseek distill models * remove redundant code when constructing model names * add mistral small to registry * rename 
model registration methods * rename deepseek registration methods * refactor naming for mistral and phi * add global register models * refactor model registration tests for new registry apis * add model search method * remove deprecated registration api * add quant type test * add registry readme * make llama registration more specific * clear registry when executing individual model registration file * more registry readme updates * Update _auto_install.py * Llama4 * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Synthetic data * Update mapper.py * Xet and Synthetic * Update synthetic.py * Update loader.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update pyproject.toml * Delete .gitignore * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update _utils.py * Update pyproject.toml * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update chat_templates.py * Seasame force 
float16 / float32 * Fix Seasame * Update loader.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * is_multimodal * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update vision.py * Update vision.py * Update vision.py * UNSLOTH_DISABLE_STATIC_GENERATION * Update vision.py * Auto vision detection * Sesame * Whisper * Update loader.py * Update loader.py * Update loader.py * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update _utils.py * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * logging * Update pyproject.toml * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * logits / temperature * Update rl_replacements.py * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Debugging only * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Generic efficient GRPO * Update rl_replacements.py * Update rl_replacements.py * Remove debugging * Update rl_replacements.py * Update rl_replacements.py * Update vision.py * Update llama.py * Update rl_replacements.py * versioning * Update _utils.py * Update vision.py * Update mapper.py * Update loader.py * Update mapper.py * Update vision.py * Update loader.py * Update vision.py * Update loader.py * Update _utils.py * Update vision.py * gradient checkpointing * Gemma 3N fixes * Update loader.py * Versioning * Gemma 3N fixes * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Fix setup.py * setup.py * Prints * Update setup.py * Update setup.py * Update 
setup.py * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update vision.py * Update vision.py * Update pyproject.toml * Update vision.py * Update _utils.py * Update __init__.py * Update __init__.py --------- Co-authored-by: jeromeku Co-authored-by: Michael Han <107991372+shimmyshimmer@users.noreply.github.com> * silienty skip falcon h1 import is transformers_version < 4.53.0 (#2912) * Dynamically adjust get_per_token_logps function and patch as well (#2911) * add intel gpu with vllm support (#2903) * [bugs] fix for casual mask (#2868) * fix for casual mask * use un_casual in sdpa * add missing mask * fix for type * Explicitly check if xformers exists for attention (#2889) * Update __init__.py * Update llama.py * if mlp doesn't exist in layer module check for feed_forward name for falcon h1 (#2913) * Move inputs to right devices. (#2919) * Move tensors to right devices * fix multi gpu for non mistral models * multi GPU RoPE for gemma2 * Finish up multi GPU inference * Make multiGPU rope a list * Remove unnecessary transfer to CPU * Remove unnecessary move to CPU * Donot move inputs to device yet will be handled separately in another PR * Move inputs to appropriate decoder device * Make device count global variable * Cleanup RoPE device code * Fixup num_gpu to device count * Cleanup device counts * Use device index for RoPE get_cache * Donot typecast * Use tuple instead of list for tensors. 
Use device index directly * fixup move to device logic * WIP VLM vLLM * Make vLLM patch a function * Add save and load lora functions * Make fast_inference setup depend on the flag * Improve fast inference patching mechanism * Make vision setting depend on checks in fastbasemodel * Check LoRA and vLLM intercompatibility for vision models * Comment pointing to vLLM LoRA check * Improve lora validation on vLLM * Error out on no vLLM and increase max lora rank * Bug fixes (#3017) * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update pyproject.toml * Delete .gitignore * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update _utils.py * Update pyproject.toml * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update chat_templates.py * Seasame force float16 / float32 * Fix Seasame * Update loader.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * is_multimodal * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update vision.py * Update vision.py * Update vision.py * UNSLOTH_DISABLE_STATIC_GENERATION * Update vision.py * Auto vision detection * Sesame * Whisper * Update loader.py * Update loader.py * 
Update loader.py * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update _utils.py * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * logging * Update pyproject.toml * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * logits / temperature * Update rl_replacements.py * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Debugging only * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Generic efficient GRPO * Update rl_replacements.py * Update rl_replacements.py * Remove debugging * Update rl_replacements.py * Update rl_replacements.py * Update vision.py * Update llama.py * Update rl_replacements.py * versioning * Update _utils.py * Update vision.py * Update mapper.py * Update loader.py * Update mapper.py * Update vision.py * Update loader.py * Update vision.py * Update loader.py * Update _utils.py * Update vision.py * gradient checkpointing * Gemma 3N fixes * Update loader.py * Versioning * Gemma 3N fixes * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Fix setup.py * setup.py * Prints * Update setup.py * Update setup.py * Update setup.py * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update vision.py * Update vision.py * Update pyproject.toml * Update vision.py * Update _utils.py * Update __init__.py * Update __init__.py * Small fixes * Update vision.py * Update vision.py * versioning * Update __init__.py * Update llama.py * Update rl.py * 
Update rl.py * Update _utils.py * Update vision.py * Update vision.py * compiler stance * Update _utils.py * Update pyproject.toml * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Revert "Revert "Add Qwen2.5-VL-32B-Instruct mapping to fix quantized model me…" (#2990) This reverts commit 204fc46e1904ac3de01f06099f07b88b46be38bf. * skip_guard_eval_unsafe fix * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update llama.py * Update llama.py * Fix `quantization_method` * versioning * fix for casual mask (#3011) * [intel] add for intel path for llama.py (#3012) * fix for intel path * remove unuse code * Update unsloth/models/llama.py --------- Co-authored-by: Daniel Han * Update llama.py * Fix Gemma 2 (#3024) * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update pyproject.toml * Delete .gitignore * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update _utils.py * Update pyproject.toml * Update synthetic.py * Update synthetic.py * Update synthetic.py 
* Update synthetic.py * Update chat_templates.py * Seasame force float16 / float32 * Fix Seasame * Update loader.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * is_multimodal * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update vision.py * Update vision.py * Update vision.py * UNSLOTH_DISABLE_STATIC_GENERATION * Update vision.py * Auto vision detection * Sesame * Whisper * Update loader.py * Update loader.py * Update loader.py * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update _utils.py * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * logging * Update pyproject.toml * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * logits / temperature * Update rl_replacements.py * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Debugging only * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Generic efficient GRPO * Update rl_replacements.py * Update rl_replacements.py * Remove debugging * Update rl_replacements.py * Update rl_replacements.py * Update vision.py * Update llama.py * Update rl_replacements.py * versioning * Update _utils.py * Update vision.py * Update mapper.py * Update loader.py * Update mapper.py * Update vision.py * Update loader.py * Update vision.py * Update loader.py * Update _utils.py * Update vision.py * gradient checkpointing * Gemma 3N fixes * Update loader.py * Versioning * Gemma 3N fixes * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Fix setup.py * 
setup.py * Prints * Update setup.py * Update setup.py * Update setup.py * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update vision.py * Update vision.py * Update pyproject.toml * Update vision.py * Update _utils.py * Update __init__.py * Update __init__.py * Small fixes * Update vision.py * Update vision.py * versioning * Update __init__.py * Update llama.py * Update rl.py * Update rl.py * Update _utils.py * Update vision.py * Update vision.py * compiler stance * Update _utils.py * Update pyproject.toml * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Revert "Revert "Add Qwen2.5-VL-32B-Instruct mapping to fix quantized model me…" (#2990) This reverts commit 204fc46e1904ac3de01f06099f07b88b46be38bf. 
* skip_guard_eval_unsafe fix * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update llama.py * Update llama.py * Fix `quantization_method` * versioning * Update _utils.py * Update _utils.py * Update _utils.py * falcon force float32 on sm<75 machines (#3026) * Fix torch compile issues (#3028) * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update pyproject.toml * Delete .gitignore * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update _utils.py * Update pyproject.toml * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update chat_templates.py * Seasame force float16 / float32 * Fix Seasame * Update loader.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * is_multimodal * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update vision.py * Update vision.py * Update vision.py * UNSLOTH_DISABLE_STATIC_GENERATION * Update vision.py * Auto vision detection * Sesame * Whisper * Update loader.py * Update loader.py * Update loader.py * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update _utils.py * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * logging * Update 
pyproject.toml * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * logits / temperature * Update rl_replacements.py * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Debugging only * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Generic efficient GRPO * Update rl_replacements.py * Update rl_replacements.py * Remove debugging * Update rl_replacements.py * Update rl_replacements.py * Update vision.py * Update llama.py * Update rl_replacements.py * versioning * Update _utils.py * Update vision.py * Update mapper.py * Update loader.py * Update mapper.py * Update vision.py * Update loader.py * Update vision.py * Update loader.py * Update _utils.py * Update vision.py * gradient checkpointing * Gemma 3N fixes * Update loader.py * Versioning * Gemma 3N fixes * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Fix setup.py * setup.py * Prints * Update setup.py * Update setup.py * Update setup.py * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update vision.py * Update vision.py * Update pyproject.toml * Update vision.py * Update _utils.py * Update __init__.py * Update __init__.py * Small fixes * Update vision.py * Update vision.py * versioning * Update __init__.py * Update llama.py * Update rl.py * Update rl.py * Update _utils.py * Update vision.py * Update vision.py * compiler stance * Update _utils.py * Update pyproject.toml * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * 
Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Revert "Revert "Add Qwen2.5-VL-32B-Instruct mapping to fix quantized model me…" (#2990) This reverts commit 204fc46e1904ac3de01f06099f07b88b46be38bf. * skip_guard_eval_unsafe fix * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update llama.py * Update llama.py * Fix `quantization_method` * versioning * Update _utils.py * Update _utils.py * Update _utils.py * check stride * Cleanup * Update rope_embedding.py * Update gemma2.py * Fix `set_stance` * Update pyproject.toml * Update _utils.py * Fixup patch vllm * Disable mllama * Use variables to decide VLM support * Better attn_impl handling * Patch TF protobuf incompatability * Torch 2.8 (#3186) * Fix mamba * Update loader.py * Update vision.py * Update loader.py * Filter vLLM standby logs (#3131) * filter vLLM standby logs * safeguard standby logger patch * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py --------- Co-authored-by: Daniel Han * Update loader.py * Add scaler * Update llama.py * Update _utils.py * Versioning * GPT OSS fix * GPT OSS fix * Update loader.py * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Update vision.py * Update llama.py * Update llama.py * Update llama.py * Versioning * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Upcast norms * Update loader.py * Update vision.py * Upcast layernorms * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update save.py * Update rl.py * Update pyproject.toml * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update _utils.py * Update __init__.py * Torch 2.8 * Update rl_replacements.py --------- Co-authored-by: Datta Nimmaturi * Update _auto_install.py * Update pyproject.toml * 
Update rl.py * Protobuf issue * Update pyproject.toml * Fix extras transformers typo in pyproject.toml * Update _utils.py * Bug fixes (#3195) * Fix mamba * Update loader.py * Update vision.py * Update loader.py * Filter vLLM standby logs (#3131) * filter vLLM standby logs * safeguard standby logger patch * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py --------- Co-authored-by: Daniel Han * Update loader.py * Add scaler * Update llama.py * Update _utils.py * Versioning * GPT OSS fix * GPT OSS fix * Update loader.py * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Update vision.py * Update llama.py * Update llama.py * Update llama.py * Versioning * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Upcast norms * Update loader.py * Update vision.py * Upcast layernorms * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update save.py * Update rl.py * Update pyproject.toml * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update _utils.py * Update __init__.py * Torch 2.8 * Update rl_replacements.py * Update loader.py * UNSLOTH_ENABLE_CCE * Fix * Update loader.py * Update loader.py * Update __init__.py * Update __init__.py * Update __init__.py * Update __init__.py * Import fixes * Update loader.py * Fix aimv2 issue * Update loader.py * Update import_fixes.py * Update import_fixes.py * Update loader.py * Update loader.py * Update loader.py * Upgrade * Update loader.py * Update loader.py * Update loader.py * Update loader.py --------- Co-authored-by: Datta Nimmaturi * adallow float32 dtype in FastLanguageModel (#3204) * Update loader.py * Update vision.py * Suppress message and use unsloth sampling params * Use trl sampling params for now * Improve error message * fixup quantized fast inference model name * Add mistral 3 support --------- Co-authored-by: Michael Han 
<107991372+shimmyshimmer@users.noreply.github.com> Co-authored-by: Daniel Han Co-authored-by: jeromeku Co-authored-by: DoubleMathew Co-authored-by: Lei Zhenyuan Co-authored-by: parth2510 --- unsloth/models/vision.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 36e2cdd459..b0777c99a8 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -83,6 +83,7 @@ VLLM_SUPPORTED_VLM = [ "qwen2_5_vl", "gemma3", + "mistral3" ] VLLM_NON_LORA_VLM = [ "mllama" @@ -90,6 +91,7 @@ from transformers import GenerationConfig, CompileConfig, HybridCache, AutoConfig, PretrainedConfig HAS_TORCH_DTYPE = "torch_dtype" in PretrainedConfig.__doc__ + from transformers import GenerationConfig, CompileConfig, HybridCache _compile_config = CompileConfig( From 67a544de0598a6c7a5442e4578c1348e70fc0e65 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 23:10:47 -0700 Subject: [PATCH 178/272] Set padding to 0 --- unsloth/models/llama.py | 7 +++++++ unsloth/models/vision.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 8708fb5218..1b22542514 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2244,6 +2244,13 @@ def from_pretrained( unsloth_fast_generate.__doc__ = model._old_generate.__doc__ model.generate = types.MethodType(unsloth_fast_generate, model) pass + # Set weight[padding_idx] = 0 + with torch.no_grad(): + for name, module in model.named_modules(): + if type(module) is torch.nn.Embedding: + if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: + if module.padding_idx < module.weight.shape[0]: + module.weight[module.padding_idx] = 0 return model, tokenizer pass diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index b0777c99a8..10f757c21b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -876,6 +876,13 @@ def post_patch_model( 
m.for_training = functools.partial(FastBaseModel.for_training, m) m.for_inference = functools.partial(FastBaseModel.for_inference, m) m = m.model + # Set weight[padding_idx] = 0 + with torch.no_grad(): + for name, module in model.named_modules(): + if type(module) is torch.nn.Embedding: + if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: + if module.padding_idx < module.weight.shape[0]: + module.weight[module.padding_idx] = 0 return model pass From 72383278d174f42398ca7f8e3b8e3e9b18401da5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 02:24:10 -0700 Subject: [PATCH 179/272] Fix patch --- unsloth/models/loader.py | 2 +- unsloth/models/rl.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 1a9c145368..875c6f73fc 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -664,7 +664,7 @@ def from_pretrained( # Qwen 2.5 elif "qwen2_5" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." 
+ LATEST) - # Gemma 3N must be beefore Gemma 3 + # Gemma 3N must be before Gemma 3 elif "gemma3n" in model_types_all: if transformers_version < Version("4.53.0"): raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 889cbaccaf..b032f228c8 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -118,6 +118,7 @@ def generate_with_clone(*args, **kwargs): # Wrap trainer with padding to right and enable training mode import functools +from types import MethodType def prepare_for_training_mode(f): @functools.wraps(f) def wrapper(self, *args, **kwargs): @@ -496,7 +497,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): if "model" in call_args: training_check = \ "if hasattr(self, 'train'):\n"\ - " self.train = prepare_for_training_mode(self.train)\n"\ + " self.train = MethodType(prepare_for_training_mode(self.__class__.train), self)\n"\ "pass\n" RLTrainer_post += training_check pass From 8a1e6fb9d6a5147e51e161c8a9f92788457f250d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 02:29:46 -0700 Subject: [PATCH 180/272] fixup patch (#3359) Co-authored-by: Datta Nimmaturi --- unsloth/models/rl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index b032f228c8..65434bb095 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -123,11 +123,11 @@ def prepare_for_training_mode(f): @functools.wraps(f) def wrapper(self, *args, **kwargs): # Enable training mode - if hasattr(self, model) and hasattr(self.model, "for_training"): + if hasattr(self, 'model') and hasattr(self.model, "for_training"): self.model.for_training() output = f(self, *args, **kwargs) # Return inference mode - if hasattr(self, model) and hasattr(self.model, "for_inference"): + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): self.model.for_inference() return output return wrapper From 
f0ec1aeaa550daa755b8eb4b89c1a0ba6c496c91 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 07:36:15 -0700 Subject: [PATCH 181/272] Update vision.py --- unsloth/models/vision.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 10f757c21b..f62596a8c2 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -246,6 +246,13 @@ def unsloth_base_fast_generate( if cache_implementation is not None: kwargs["compile_config"] = _compile_config pass + + # Delete cached Flex Attention masks to reset inference + for name, module in self.named_modules(): + if hasattr(module, "_flex_attention_cache"): + del module._flex_attention_cache + pass + with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From a64a3b2d619af08ef5214e6281a7fa0f6c2f039a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 08:08:09 -0700 Subject: [PATCH 182/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4f9c308b32..c7f67acfdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.9", + "unsloth_zoo>=2025.9.10", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.9", + "unsloth_zoo>=2025.9.10", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 8275283c6a..ef8fc0fba6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.9.7" +__version__ = "2025.9.8" __all__ = [ "SUPPORTS_BFLOAT16", From 1b7640ba35436dda38385d7a463bebdc34ac3969 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 08:41:29 -0700 Subject: [PATCH 183/272] Update vision.py --- unsloth/models/vision.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index f62596a8c2..e83bd3ec08 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -957,7 +957,12 @@ def _for_training(m): # Pad tokenizer to the left if hasattr(m, "_saved_temp_tokenizer"): m._saved_temp_tokenizer.padding_side = "right" # Set a flag for generation! - if hasattr(m, "_flag_for_generation"): del m._flag_for_generation + if hasattr(m, "_flag_for_generation"): + try: + # Weirdly sometimes cannot succeed so do a try except + del m._flag_for_generation + except: + pass pass m = model while hasattr(m, "model"): From f5c438540a2e69d4985171f9e04badebda67e7fa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:01:53 -0700 Subject: [PATCH 184/272] Update vision.py --- unsloth/models/vision.py | 58 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e83bd3ec08..fdfaee89f5 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -83,10 +83,13 @@ VLLM_SUPPORTED_VLM = [ "qwen2_5_vl", "gemma3", - "mistral3" + "mistral3", ] VLLM_NON_LORA_VLM = [ - "mllama" + "mllama", +] +PRE_COMPILE_INFERENCE = [ + "gpt_oss", ] from transformers import GenerationConfig, CompileConfig, HybridCache, AutoConfig, PretrainedConfig @@ -250,12 +253,29 @@ def unsloth_base_fast_generate( # Delete cached Flex Attention masks to reset inference for name, module in self.named_modules(): if hasattr(module, "_flex_attention_cache"): - del module._flex_attention_cache + try: del module._flex_attention_cache + except: pass + # Solves 
AttributeError: 'SlidingWindowLayer' object has no attribute 'max_batch_size' + if hasattr(module, "_cache") and "cache_utils" in str(module._cache.__class__): + try: del module._cache + except: pass pass + # DO INFERENCE with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) + # Delete cached Flex Attention masks to reset inference + for name, module in self.named_modules(): + if hasattr(module, "_flex_attention_cache"): + try: del module._flex_attention_cache + except: pass + # Solves AttributeError: 'SlidingWindowLayer' object has no attribute 'max_batch_size' + if hasattr(module, "_cache") and "cache_utils" in str(module._cache.__class__): + try: del module._cache + except: pass + pass + # FastBaseModel.for_training(self) return output pass @@ -674,6 +694,7 @@ def from_pretrained( model, use_gradient_checkpointing = use_gradient_checkpointing, trust_remote_code = trust_remote_code, + model_type = model_type_arch, ) # Clear deleted GPU items for _ in range(3): @@ -686,6 +707,31 @@ def from_pretrained( return model, tokenizer pass + @staticmethod + def pre_compile_for_inference(model_type, model, tokenizer): + """ + We need to invoke torch.compile to save VRAM usage and make it faster downstream. + Sometimes torch.compile can use 3GB weirdly on large batches, then it goes down to <1GB. + So we invoke torch.compile on short batches to reduce VRAM usage. 
+ """ + if model_type is None or model is None or tokenizer is None: return + if str(model_type).lower() not in PRE_COMPILE_INFERENCE: return + if getattr(tokenizer, "chat_template", None) is None: return + messages = [ + [ + {"role": "user", "content": f"1+1"}, + ], + ]*4 + inputs = tokenizer.apply_chat_template( + messages, + add_generation_prompt = True, + return_tensors = "pt", + return_dict = True, + ).to(model.device) + print(f"Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take a few minutes!") + _ = model.generate(**inputs, max_new_tokens = 3) + del inputs + pass @staticmethod def get_peft_model( @@ -823,6 +869,7 @@ def post_patch_model( model, use_gradient_checkpointing = True, trust_remote_code = False, + model_type = None, ): full_finetuning = os.environ.get("UNSLOTH_ENABLE_FULL_FINETUNING", "0") == "1" @@ -850,10 +897,12 @@ def post_patch_model( patch_saving_functions(model, vision = True) # Patch tokenizer to pad to the left + tokenizer = None m = model while hasattr(m, "model"): if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): + tokenizer = m._saved_temp_tokenizer m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP @@ -862,6 +911,7 @@ def post_patch_model( pass if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): + tokenizer = m._saved_temp_tokenizer m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP @@ -890,6 +940,8 @@ def post_patch_model( if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 + # Patch for torch.compiled inference + FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 8438a7620eafc9f012c92c6ac01c61c0da0b7bf3 Mon Sep 17 
00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:05:18 -0700 Subject: [PATCH 185/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index fdfaee89f5..941e461889 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -941,6 +941,7 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for torch.compiled inference + print("Precompiling") FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 5867273c8fd0649bf8f304f4182bd9250c844ef6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:06:50 -0700 Subject: [PATCH 186/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 941e461889..07067afb3c 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -941,7 +941,7 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for torch.compiled inference - print("Precompiling") + print(model_type, model, tokenizer) FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 7b2bef1053c5b136431a727314ddf1c220afc7e9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:10:09 -0700 Subject: [PATCH 187/272] Update vision.py --- unsloth/models/vision.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 07067afb3c..5d0080f8b8 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -695,6 +695,7 @@ def from_pretrained( use_gradient_checkpointing = use_gradient_checkpointing, trust_remote_code = trust_remote_code, model_type = model_type_arch, + tokenizer = tokenizer, ) # Clear deleted GPU items for _ in 
range(3): @@ -717,6 +718,10 @@ def pre_compile_for_inference(model_type, model, tokenizer): if model_type is None or model is None or tokenizer is None: return if str(model_type).lower() not in PRE_COMPILE_INFERENCE: return if getattr(tokenizer, "chat_template", None) is None: return + # Check if already compiled and exit + for module in model.modules(): + if hasattr(module, "_pre_compiled_for_inference"): return + pass messages = [ [ {"role": "user", "content": f"1+1"}, @@ -731,6 +736,8 @@ def pre_compile_for_inference(model_type, model, tokenizer): print(f"Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take a few minutes!") _ = model.generate(**inputs, max_new_tokens = 3) del inputs + # Set we already pre compiled + model._pre_compiled_for_inference = True pass @staticmethod @@ -870,6 +877,7 @@ def post_patch_model( use_gradient_checkpointing = True, trust_remote_code = False, model_type = None, + tokenizer = None, ): full_finetuning = os.environ.get("UNSLOTH_ENABLE_FULL_FINETUNING", "0") == "1" @@ -897,12 +905,10 @@ def post_patch_model( patch_saving_functions(model, vision = True) # Patch tokenizer to pad to the left - tokenizer = None m = model while hasattr(m, "model"): if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): - tokenizer = m._saved_temp_tokenizer m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP @@ -911,7 +917,6 @@ def post_patch_model( pass if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): - tokenizer = m._saved_temp_tokenizer m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP @@ -941,7 +946,6 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for torch.compiled inference - print(model_type, model, tokenizer) 
FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 82a7697da359a8df6c54c72a02ce9795e90c60c4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:13:36 -0700 Subject: [PATCH 188/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 5d0080f8b8..15fe64858f 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -733,7 +733,7 @@ def pre_compile_for_inference(model_type, model, tokenizer): return_tensors = "pt", return_dict = True, ).to(model.device) - print(f"Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take a few minutes!") + print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take a ~ 3 minutes!") _ = model.generate(**inputs, max_new_tokens = 3) del inputs # Set we already pre compiled From aa9b200437e18a4f4856c3d8c9dbe2946b50663f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:19:24 -0700 Subject: [PATCH 189/272] Update vision.py --- unsloth/models/vision.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 15fe64858f..1a1de849cf 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -733,7 +733,8 @@ def pre_compile_for_inference(model_type, model, tokenizer): return_tensors = "pt", return_dict = True, ).to(model.device) - print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take a ~ 3 minutes!") + print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take 3 minutes or so!") + print("========= Pre compiling model for faster inference. Please be patient thank you! 
=========") _ = model.generate(**inputs, max_new_tokens = 3) del inputs # Set we already pre compiled From eb1df232a50d0ae6889e3cf3357faf9b3d31109f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:22:55 -0700 Subject: [PATCH 190/272] Update vision.py --- unsloth/models/vision.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 1a1de849cf..15fb7f8232 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -722,6 +722,22 @@ def pre_compile_for_inference(model_type, model, tokenizer): for module in model.modules(): if hasattr(module, "_pre_compiled_for_inference"): return pass + print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take 3 minutes or so!") + print("========= Pre compiling model for faster inference. Please be patient thank you! =========") + # Do single inference + messages = [ + [ + {"role": "user", "content": f"1+1"}, + ], + ]*1 + inputs = tokenizer.apply_chat_template( + messages, + add_generation_prompt = True, + return_tensors = "pt", + return_dict = True, + ).to(model.device) + _ = model.generate(**inputs, max_new_tokens = 3) + # Do batched inference messages = [ [ {"role": "user", "content": f"1+1"}, @@ -733,10 +749,7 @@ def pre_compile_for_inference(model_type, model, tokenizer): return_tensors = "pt", return_dict = True, ).to(model.device) - print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take 3 minutes or so!") - print("========= Pre compiling model for faster inference. Please be patient thank you! 
=========") _ = model.generate(**inputs, max_new_tokens = 3) - del inputs # Set we already pre compiled model._pre_compiled_for_inference = True pass From 563aa35081e581bcbac63cf8943c0ad4489195ac Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:33:51 -0700 Subject: [PATCH 191/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 15fb7f8232..4a559dc8e3 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -727,7 +727,7 @@ def pre_compile_for_inference(model_type, model, tokenizer): # Do single inference messages = [ [ - {"role": "user", "content": f"1+1"}, + {"role": "user", "content": f"What is 1+1 equal to?"}, ], ]*1 inputs = tokenizer.apply_chat_template( From 4bfde2ea2333be379456b5b296cc20b960891869 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 23:14:47 -0700 Subject: [PATCH 192/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 4a559dc8e3..94dcf3673d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -960,7 +960,7 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for torch.compiled inference - FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) + # FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From d6beafe16a7c9f388777e04032397ff6e61cc9e6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 00:01:42 -0700 Subject: [PATCH 193/272] MXFP4 dequant --- unsloth/models/loader.py | 1 + unsloth/models/vision.py | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 875c6f73fc..ade7e1292f 100644 --- 
a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -910,6 +910,7 @@ def from_pretrained( supports_sdpa = supports_sdpa, whisper_language = whisper_language, whisper_task = whisper_task, + auto_config = auto_config, # Pass vLLM/inference parameters fast_inference = fast_inference, diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 94dcf3673d..1894bd185f 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -300,7 +300,9 @@ def from_pretrained( supports_sdpa = True, whisper_language = None, whisper_task = None, - fast_inference = False, + auto_config = None, + # vLLM parameters + fast_inference = False, gpu_memory_utilization = 0.5, float8_kv_cache = False, random_state = 3407, @@ -500,10 +502,27 @@ def from_pretrained( # Cannot be None, since HF now checks for the config if load_in_4bit: # Ignore load_in_4bit / load_in_8bit for MXFP4 - best to get config file - if "gpt-oss" in model_name.lower(): + if "gpt-oss-20b" in model_name.lower() or "gpt-oss-120b" in model_name.lower(): pass else: kwargs["quantization_config"] = bnb_config + else: + # Try dequantizing the quantized model if it's a quantized model + if auto_config is None: + auto_config = AutoConfig.from_pretrained( + model_name, + token = token, + trust_remote_code = trust_remote_code, + ) + if hasattr(auto_config, "quantization_config"): + from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING + quantizer = AUTO_QUANTIZATION_CONFIG_MAPPING[auto_config["quant_method"]] + quantizer_kwargs = {} + if "dequantize" in inspect.signature(quantizer).parameters: + quantizer_kwargs["dequantize"] = True + quantization_config = quantizer.from_dict(config, **quantizer_kwargs) + kwargs["quantization_config"] = quantization_config + pass pass # Check if using forced float32 - we load it in bfloat16, then cast to float16! 
From 19cfe1be33961e083ec345357d6e4eb083bb2ab8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 00:03:24 -0700 Subject: [PATCH 194/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ade7e1292f..c859d62858 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -910,7 +910,7 @@ def from_pretrained( supports_sdpa = supports_sdpa, whisper_language = whisper_language, whisper_task = whisper_task, - auto_config = auto_config, + auto_config = model_config, # Pass vLLM/inference parameters fast_inference = fast_inference, From 63a7f65a89893e484c620365f5d15f9c1f782a33 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 00:06:52 -0700 Subject: [PATCH 195/272] Update vision.py --- unsloth/models/vision.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 1894bd185f..7722ddd482 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -516,7 +516,8 @@ def from_pretrained( ) if hasattr(auto_config, "quantization_config"): from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING - quantizer = AUTO_QUANTIZATION_CONFIG_MAPPING[auto_config["quant_method"]] + quantization_config = auto_config.quantization_config + quantizer = AUTO_QUANTIZATION_CONFIG_MAPPING[quantization_config["quant_method"]] quantizer_kwargs = {} if "dequantize" in inspect.signature(quantizer).parameters: quantizer_kwargs["dequantize"] = True From df5282b7339b7712f916dbdf2ab958f607184271 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 00:13:28 -0700 Subject: [PATCH 196/272] load_in_16bit --- unsloth/models/loader.py | 25 ++++++++++++++++--------- unsloth/models/vision.py | 12 ++++++++---- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 
c859d62858..0e130fb973 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -110,8 +110,9 @@ def from_pretrained( model_name = "unsloth/Llama-3.2-1B-Instruct", max_seq_length = 2048, dtype = None, - load_in_4bit = True, - load_in_8bit = False, + load_in_4bit = True, # 4bit QLoRA + load_in_8bit = False, # 8bit LoRA + load_in_16bit = False, # 16bit LoRA full_finetuning = False, token = None, device_map = "sequential", @@ -147,6 +148,7 @@ def from_pretrained( dtype = dtype, load_in_4bit = load_in_4bit, load_in_8bit = load_in_8bit, + load_in_16bit = load_in_16bit, full_finetuning = full_finetuning, token = token, device_map = device_map, @@ -386,6 +388,7 @@ def from_pretrained( dtype = dtype, load_in_4bit = load_in_4bit, load_in_8bit = load_in_8bit, + load_in_16bit = load_in_16bit, full_finetuning = full_finetuning, token = token, device_map = device_map, @@ -523,8 +526,9 @@ def from_pretrained( model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", max_seq_length = 2048, dtype = None, - load_in_4bit = True, - load_in_8bit = False, + load_in_4bit = True, # 4bit QLoRA + load_in_8bit = False, # 8bit LoRA + load_in_16bit = False, # 16bit LoRA full_finetuning = False, token = None, device_map = "sequential", @@ -576,15 +580,17 @@ def from_pretrained( if full_finetuning and (load_in_4bit or load_in_8bit): print("Unsloth: You selected full finetuning support, but 4bit / 8bit is enabled - disabling LoRA / QLoRA.") - load_in_4bit = False - load_in_8bit = False + load_in_4bit = False + load_in_8bit = False + load_in_16bit = False pass - if load_in_4bit and load_in_8bit: + if int(load_in_4bit) + int(load_in_8bit) + int(load_in_16bit) >= 2: raise RuntimeError( - "Unsloth: Can only load in 4bit or 8bit, not both!\n"\ + "Unsloth: Can only load in 4bit or 8bit or 16bit, not a combination!\n"\ "Also, we by default set `load_in_4bit = True`.\n"\ - "If you want 8bit finetuning, set both `load_in_4bit = False` and `load_in_8bit = True`" + "If you want 8bit 
finetuning, set both `load_in_4bit = False` and `load_in_8bit = True`\n"\ + "If you want 16bit LoRA finetuning, set `load_in_16bit = True`" ) pass @@ -898,6 +904,7 @@ def from_pretrained( dtype = _get_dtype(dtype), load_in_4bit = load_in_4bit, load_in_8bit = load_in_8bit, + load_in_16bit = load_in_16bit, full_finetuning = full_finetuning, token = token, device_map = device_map, diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 7722ddd482..506d16bc57 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -289,6 +289,7 @@ def from_pretrained( dtype = None, load_in_4bit = True, load_in_8bit = False, + load_in_16bit = False, full_finetuning = False, token = None, device_map = "sequential", @@ -462,12 +463,13 @@ def from_pretrained( bnb_config = None if full_finetuning and (load_in_4bit or load_in_8bit): print("Unsloth: You selected full finetuning support, but 4bit / 8bit is enabled - disabling LoRA / QLoRA.") - load_in_4bit = False - load_in_8bit = False + load_in_4bit = False + load_in_8bit = False + load_in_16bit = False pass - if load_in_4bit and load_in_8bit: - raise RuntimeError("Unsloth: Can only load in 4bit or 8bit, not both!") + if int(load_in_4bit) + int(load_in_8bit) + int(load_in_16bit) >= 2: + raise RuntimeError("Unsloth: Can only load in 4bit or 8bit or 16bit, not a combination!") if load_in_4bit: bnb_config = BitsAndBytesConfig( load_in_4bit = True, @@ -481,6 +483,8 @@ def from_pretrained( load_in_8bit = True, llm_int8_skip_modules = SKIP_QUANTIZATION_MODULES.copy(), ) + elif load_in_16bit: + bnb_config = None elif not load_in_4bit and not load_in_8bit and not full_finetuning: print("Unsloth: QLoRA and full finetuning all not selected. 
Switching to 16bit LoRA.") pass From e7174b161f3dfb083076ed5844a7358e90019cea Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 00:14:02 -0700 Subject: [PATCH 197/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 506d16bc57..f1ff53a526 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -525,7 +525,7 @@ def from_pretrained( quantizer_kwargs = {} if "dequantize" in inspect.signature(quantizer).parameters: quantizer_kwargs["dequantize"] = True - quantization_config = quantizer.from_dict(config, **quantizer_kwargs) + quantization_config = quantizer.from_dict(quantization_config, **quantizer_kwargs) kwargs["quantization_config"] = quantization_config pass pass From ffe5aca5d7b622d4b7de5297570519085931e9d4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 01:57:28 -0700 Subject: [PATCH 198/272] Update vision.py --- unsloth/models/vision.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index f1ff53a526..78d2cd6573 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -760,7 +760,7 @@ def pre_compile_for_inference(model_type, model, tokenizer): return_tensors = "pt", return_dict = True, ).to(model.device) - _ = model.generate(**inputs, max_new_tokens = 3) + _ = model.generate(**inputs, max_new_tokens = 1) # Do batched inference messages = [ [ @@ -773,7 +773,7 @@ def pre_compile_for_inference(model_type, model, tokenizer): return_tensors = "pt", return_dict = True, ).to(model.device) - _ = model.generate(**inputs, max_new_tokens = 3) + _ = model.generate(**inputs, max_new_tokens = 2) # Set we already pre compiled model._pre_compiled_for_inference = True pass @@ -984,7 +984,7 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for 
torch.compiled inference - # FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) + FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 81356cc76031d5684aabee020394398acec8f785 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 07:22:30 -0700 Subject: [PATCH 199/272] Update vision.py --- unsloth/models/vision.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 78d2cd6573..ff3e216c3a 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -511,7 +511,6 @@ def from_pretrained( else: kwargs["quantization_config"] = bnb_config else: - # Try dequantizing the quantized model if it's a quantized model if auto_config is None: auto_config = AutoConfig.from_pretrained( model_name, @@ -523,8 +522,9 @@ def from_pretrained( quantization_config = auto_config.quantization_config quantizer = AUTO_QUANTIZATION_CONFIG_MAPPING[quantization_config["quant_method"]] quantizer_kwargs = {} - if "dequantize" in inspect.signature(quantizer).parameters: - quantizer_kwargs["dequantize"] = True + # We cannot dequantize since gpt-oss-20b MXFP4 will now be gpt-oss-20b-BF16 + # if "dequantize" in inspect.signature(quantizer).parameters: + # quantizer_kwargs["dequantize"] = True quantization_config = quantizer.from_dict(quantization_config, **quantizer_kwargs) kwargs["quantization_config"] = quantization_config pass From 2313ea949292209f602bad9b75a9000f8d3f217e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Sep 2025 03:32:05 -0700 Subject: [PATCH 200/272] Update rl.py --- unsloth/models/rl.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 65434bb095..2b6293993d 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -963,6 +963,19 @@ def patch_functions(RLTrainer, trainer_file, RLTrainer_name, all_imports, import source = edit_function(function, 
source) pass + """ + import torch + X = torch.ones((2, 2048, 201088), dtype = torch.bfloat16, device = "cuda") + X[torch.randperm(2, dtype = torch.int64, device = X.device)] + + will error out in torch 2.8 AcceleratorError: CUDA error: invalid configuration argument + """ + source = re.sub( + r"(\n[\s]{4,})generation_batch = shuffle_sequence_dict\(generation_batch\)\n", + r"\n\1try: generation_batch = shuffle_sequence_dict(generation_batch)\n\1except: pass\n", + source, + ) + # llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model source = re.sub( r"(\n[\s]{4,}).+?model_executor\.driver_worker.+?\n", From 0c18d86fbf64269a54f924102eb65bd26b424f7b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Sep 2025 22:00:45 -0700 Subject: [PATCH 201/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index ff3e216c3a..5b99cef8a7 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -984,7 +984,7 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for torch.compiled inference - FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) + # FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 19017fd5ba998296851d2eb6a7ad8a80b553da2e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 26 Sep 2025 02:33:44 -0700 Subject: [PATCH 202/272] offload_embedding --- unsloth/models/loader.py | 5 +++++ unsloth/models/vision.py | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0e130fb973..98396bb754 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -123,6 +123,7 @@ def from_pretrained( resize_model_vocab = None, revision = None, use_exact_model_name = False, + offload_embedding = False, fast_inference = False, # uses 
vLLM gpu_memory_utilization = 0.5, @@ -161,6 +162,7 @@ def from_pretrained( return_logits = False, # Return logits fullgraph = True, # No graph breaks use_exact_model_name = use_exact_model_name, + offload_embedding = offload_embedding, # Pass vLLM/inference parameters fast_inference = fast_inference, @@ -401,6 +403,7 @@ def from_pretrained( return_logits = False, # Return logits fullgraph = True, # No graph breaks use_exact_model_name = use_exact_model_name, + offload_embedding = offload_embedding, # Pass vLLM/inference parameters fast_inference = fast_inference, @@ -545,6 +548,7 @@ def from_pretrained( whisper_language = None, whisper_task = None, unsloth_force_compile = False, + offload_embedding = False, # Add the missing vLLM/inference parameters fast_inference = False, # uses vLLM @@ -918,6 +922,7 @@ def from_pretrained( whisper_language = whisper_language, whisper_task = whisper_task, auto_config = model_config, + offload_embedding = offload_embedding, # Pass vLLM/inference parameters fast_inference = fast_inference, diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 5b99cef8a7..e658f18347 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -302,6 +302,7 @@ def from_pretrained( whisper_language = None, whisper_task = None, auto_config = None, + offload_embedding = False, # vLLM parameters fast_inference = False, gpu_memory_utilization = 0.5, @@ -551,6 +552,15 @@ def from_pretrained( if hasattr(model, 'generate'): model.fast_generate = model.generate model.fast_generate_batches = error_out_no_vllm + if offload_embedding: + embed_tokens = model.get_input_embeddings() + nbytes = embed_tokens.weight.numel() * embed_tokens.weight.itemsize + ngb = round(nbytes / 1024 / 1024 / 1024, 2) + print(f"Unsloth: Offloading embeddings to RAM to save {ngb} GB.") + embed_tokens.to("cpu") + # Must free GPU memory otherwise will not free! 
+ torch.cuda.empty_cache() + gc.collect() else: from unsloth_zoo.vllm_utils import ( load_vllm, From 77fca7998e09db683d80ebfdfc0b5b2715c601cb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 26 Sep 2025 03:04:53 -0700 Subject: [PATCH 203/272] Update vision.py --- unsloth/models/vision.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e658f18347..dc1def7a7b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -558,6 +558,17 @@ def from_pretrained( ngb = round(nbytes / 1024 / 1024 / 1024, 2) print(f"Unsloth: Offloading embeddings to RAM to save {ngb} GB.") embed_tokens.to("cpu") + embed_tokens.weight.pin_memory() + + # Add hooks to move inputs to CPU and back to CUDA + def pre_hook(module, args): + args[0]._old_device = args[0].device + return (args[0].to("cpu", non_blocking = True)) + def post_hook(module, args, output): + old_device = getattr(args[0], "_old_device", "cuda") + return output.to(old_device, non_blocking = True) + embed_tokens.register_forward_pre_hook(pre_hook, prepend = True) + embed_tokens.register_forward_hook (post_hook, prepend = True) # Must free GPU memory otherwise will not free! 
torch.cuda.empty_cache() gc.collect() From 92084ba38bac1bb770dae817ce7acc94412d5cad Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 26 Sep 2025 03:08:47 -0700 Subject: [PATCH 204/272] Update vision.py --- unsloth/models/vision.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index dc1def7a7b..cedaa14125 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -558,7 +558,6 @@ def from_pretrained( ngb = round(nbytes / 1024 / 1024 / 1024, 2) print(f"Unsloth: Offloading embeddings to RAM to save {ngb} GB.") embed_tokens.to("cpu") - embed_tokens.weight.pin_memory() # Add hooks to move inputs to CPU and back to CUDA def pre_hook(module, args): From 499f939c9b118d59a85952da626c913f6a915cfd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 26 Sep 2025 03:17:35 -0700 Subject: [PATCH 205/272] Update vision.py --- unsloth/models/vision.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index cedaa14125..2cd95b0377 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -560,14 +560,15 @@ def from_pretrained( embed_tokens.to("cpu") # Add hooks to move inputs to CPU and back to CUDA - def pre_hook(module, args): - args[0]._old_device = args[0].device - return (args[0].to("cpu", non_blocking = True)) - def post_hook(module, args, output): - old_device = getattr(args[0], "_old_device", "cuda") - return output.to(old_device, non_blocking = True) - embed_tokens.register_forward_pre_hook(pre_hook, prepend = True) - embed_tokens.register_forward_hook (post_hook, prepend = True) + # [TODO] Doesn't seem to work! 
+ # def pre_hook(module, args): + # args[0]._old_device = args[0].device + # return (args[0].to("cpu", non_blocking = True)) + # def post_hook(module, args, output): + # old_device = getattr(args[0], "_old_device", "cuda") + # return output.to(old_device, non_blocking = True) + # embed_tokens.register_forward_pre_hook(pre_hook, prepend = True) + # embed_tokens.register_forward_hook (post_hook, prepend = True) # Must free GPU memory otherwise will not free! torch.cuda.empty_cache() gc.collect() From f72c0a9b26b89ad1387b020aa25b242d0e0a7833 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Sep 2025 17:44:58 -0700 Subject: [PATCH 206/272] Update vision.py --- unsloth/models/vision.py | 52 ++-------------------------------------- 1 file changed, 2 insertions(+), 50 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 2cd95b0377..071416dd1b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -524,8 +524,8 @@ def from_pretrained( quantizer = AUTO_QUANTIZATION_CONFIG_MAPPING[quantization_config["quant_method"]] quantizer_kwargs = {} # We cannot dequantize since gpt-oss-20b MXFP4 will now be gpt-oss-20b-BF16 - # if "dequantize" in inspect.signature(quantizer).parameters: - # quantizer_kwargs["dequantize"] = True + if load_in_16bit and "dequantize" in inspect.signature(quantizer).parameters: + quantizer_kwargs["dequantize"] = True quantization_config = quantizer.from_dict(quantization_config, **quantizer_kwargs) kwargs["quantization_config"] = quantization_config pass @@ -753,52 +753,6 @@ def from_pretrained( return model, tokenizer pass - @staticmethod - def pre_compile_for_inference(model_type, model, tokenizer): - """ - We need to invoke torch.compile to save VRAM usage and make it faster downstream. - Sometimes torch.compile can use 3GB weirdly on large batches, then it goes down to <1GB. - So we invoke torch.compile on short batches to reduce VRAM usage. 
- """ - if model_type is None or model is None or tokenizer is None: return - if str(model_type).lower() not in PRE_COMPILE_INFERENCE: return - if getattr(tokenizer, "chat_template", None) is None: return - # Check if already compiled and exit - for module in model.modules(): - if hasattr(module, "_pre_compiled_for_inference"): return - pass - print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take 3 minutes or so!") - print("========= Pre compiling model for faster inference. Please be patient thank you! =========") - # Do single inference - messages = [ - [ - {"role": "user", "content": f"What is 1+1 equal to?"}, - ], - ]*1 - inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt = True, - return_tensors = "pt", - return_dict = True, - ).to(model.device) - _ = model.generate(**inputs, max_new_tokens = 1) - # Do batched inference - messages = [ - [ - {"role": "user", "content": f"1+1"}, - ], - ]*4 - inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt = True, - return_tensors = "pt", - return_dict = True, - ).to(model.device) - _ = model.generate(**inputs, max_new_tokens = 2) - # Set we already pre compiled - model._pre_compiled_for_inference = True - pass - @staticmethod def get_peft_model( model, @@ -1004,8 +958,6 @@ def post_patch_model( if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 - # Patch for torch.compiled inference - # FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 2a7cfa0ecf137fedd970e67c330d893c7e8f60f0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Sep 2025 17:47:28 -0700 Subject: [PATCH 207/272] Update vision.py --- unsloth/models/vision.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 
071416dd1b..0510b26128 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -856,7 +856,11 @@ def get_peft_model( # Enable gradients on modules which are trainable requires_grad_for_gradient_checkpointing(model) trust_remote_code = getattr(model, "_unsloth_trust_remote_code", False) - model = FastBaseModel.post_patch_model(model, use_gradient_checkpointing, trust_remote_code = trust_remote_code) + model = FastBaseModel.post_patch_model( + model, + use_gradient_checkpointing = use_gradient_checkpointing, + trust_remote_code = trust_remote_code, + ) model.max_seq_length = max_seq_length # Save to modules as well for module in model.modules(): From 2577d8162f9bfe734a30404dff8980530a6701c3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Sep 2025 17:50:53 -0700 Subject: [PATCH 208/272] Update vision.py --- unsloth/models/vision.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 0510b26128..3522d46f0c 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -549,7 +549,7 @@ def from_pretrained( # attn_implementation = attn_implementation, **kwargs, ) - if hasattr(model, 'generate'): + if hasattr(model, "generate"): model.fast_generate = model.generate model.fast_generate_batches = error_out_no_vllm if offload_embedding: @@ -612,8 +612,17 @@ def from_pretrained( llm = load_vllm(**load_vllm_kwargs) # Convert to HF format - _, quant_state_dict = get_vllm_state_dict(llm, config = model_config, is_vision_model = True) - model = convert_vllm_to_huggingface(quant_state_dict, model_config, dtype, bnb_config, is_vision_model = True) + _, quant_state_dict = get_vllm_state_dict( + llm, + config = model_config, + is_vision_model = True, + ) + model = convert_vllm_to_huggingface( + quant_state_dict, + model_config, + dtype, bnb_config, + is_vision_model = True, + ) model.vllm_engine = llm model.fast_generate = model.vllm_engine.generate 
model.fast_generate_batches = functools.partial(generate_batches, model.vllm_engine) From 1eee987e60d908ee415caa9108dc50398ace64ba Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 02:32:05 -0700 Subject: [PATCH 209/272] Update rl_replacements.py --- unsloth/models/rl_replacements.py | 50 ++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py index ec81106890..a207514e72 100644 --- a/unsloth/models/rl_replacements.py +++ b/unsloth/models/rl_replacements.py @@ -27,6 +27,7 @@ from collections import defaultdict from unsloth_zoo.rl_replacements import RL_REPLACEMENTS, left_pack_padding from unsloth import DEVICE_TYPE +import textwrap RL_EXTRA_ARGS = defaultdict(list) RL_FUNCTIONS = defaultdict(list) @@ -295,12 +296,59 @@ def grpo_trainer__generate_and_score_completions(function_name, function): if self.use_vllm:""" function = function.replace(replace_part, new_replacement) - return function pass RL_FUNCTIONS["grpo_trainer"].append(grpo_trainer__generate_and_score_completions) +# Fix {"reasoning_effort" : "high"} not applied +def grpo_trainer_fix_maybe_apply_chat_template(function_name, function): + spaces = function.find("def ") + if spaces % 4 != 0: return function + spaces += 4 + replacement = """ + _chat_template_ = getattr(self.processing_class, "chat_template", None) + if _chat_template_ is None: _chat_template_ = "" + _supported_keys_ = set(("prompt", "chosen", "rejected", "completion", "messages", "label")) + + prompts_text = [] + for _example_ in __INPUTS__REPLACEMENT__: + _tokenizer_kwargs_ = {} + if type(_example_) is not dict: + _example_ = {"prompt": _example_} + _left_keys_ = _example_.keys() - _supported_keys_ + for k in _left_keys_: + if k in _chat_template_: + v = _example_[k] + if type(v) is str: + _tokenizer_kwargs_[k] = v + _x_ = maybe_apply_chat_template(_example_, self.processing_class, **_tokenizer_kwargs_)["prompt"] + 
prompts_text.append(_x_) + """ + replacement = textwrap.dedent(replacement).strip() + replacement = textwrap.indent(replacement, spaces*" ") + replacement = f"\n{replacement}\n" + what = 'prompts_text = [maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs]' + function = function.replace(what, replacement.replace("__INPUTS__REPLACEMENT__", "inputs")) + + """prompts_text = [ + maybe_apply_chat_template({"prompt": prompt}, self.processing_class)["prompt"] for prompt in prompts + ]""" + function = re.sub( + r"prompts_text = \["\ + r"[\s]{0,}"\ + r"maybe_apply_chat_template\(\{[\"\']prompt[\"\'][\s]{0,}\:[\s]{0,}prompt[\s]{0,}\}[\s]{0,}\,[\s]{0,}self\.processing_class\)"\ + r"\[[\"\']prompt[\"\']\] for prompt in prompts"\ + r"[\s]{0,}"\ + r"\]", + replacement.replace("__INPUTS__REPLACEMENT__", "prompts"), + function, + ) + return function +pass +RL_FUNCTIONS["grpo_trainer"].append(grpo_trainer_fix_maybe_apply_chat_template) + + # Remove _move_model_to_vllm def grpo_trainer__move_model_to_vllm(function_name, function): if function_name != "_move_model_to_vllm": return function From 1edc796df17ec4c2da37b92168a79042d417eba8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 03:47:23 -0700 Subject: [PATCH 210/272] Update loader.py --- unsloth/models/loader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 98396bb754..3bcab1ce87 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -662,10 +662,15 @@ def from_pretrained( ) model_types_all = ",".join(model_types) + "," - # Check versions + # Save model types and loading method lowered_model_name = model_name.lower() - if os.environ.get("UNSLOTH_MODEL_NAME", "") == "": - os.environ["UNSLOTH_MODEL_NAME"] = lowered_model_name + string = os.environ.get("UNSLOTH_MODEL_NAME", "") + model_types_all + if load_in_4bit: string += "_load_in_4bit_" + if load_in_8bit: string += 
"_load_in_8bit_" + if load_in_16bit: string += "_load_in_16bit_" + os.environ["UNSLOTH_MODEL_NAME"] = string + + # Check versions LATEST = '\nPlease use transformers via `pip install --no-deps git+https://github.com/huggingface/transformers.git`' NIGHTLY = '\nPlease use nightly transformers via pip install --upgrade "transformers>=4.49.0"`' # Pixtral From 205d09cb1af754bcd2b948028ee729d3706e1a3e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 04:35:03 -0700 Subject: [PATCH 211/272] Fix padding issue --- unsloth/models/_utils.py | 2 +- unsloth/models/vision.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5af8c756e3..2df9b878d0 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.9" +__version__ = "2025.9.10" __all__ = [ "SUPPORTS_BFLOAT16", diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 524bf64cdb..ae76c573d0 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -965,12 +965,15 @@ def post_patch_model( m.for_inference = functools.partial(FastBaseModel.for_inference, m) m = m.model # Set weight[padding_idx] = 0 - with torch.no_grad(): - for name, module in model.named_modules(): - if type(module) is torch.nn.Embedding: - if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: - if module.padding_idx < module.weight.shape[0]: - module.weight[module.padding_idx] = 0 + # Only do this if tokenizer is defined since eos_token == pad_token sometimes! 
+ pad_token_id = getattr(tokenizer, "pad_token_id", None) + if tokenizer is not None and getattr(tokenizer, "eos_token_id", None) != pad_token_id: + with torch.no_grad(): + for name, module in model.named_modules(): + if type(module) is torch.nn.Embedding: + if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: + if module.padding_idx == pad_token_id and module.padding_idx < module.weight.shape[0]: + module.weight[module.padding_idx] = 0 return model pass From 07cc6ed405743f81e711ac4e2c53059a723111ff Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 04:59:34 -0700 Subject: [PATCH 212/272] Update pyproject.toml --- pyproject.toml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0a3cfa1f79..a3aa62d37b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,18 +37,18 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.11", + "unsloth_zoo>=2025.9.12", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", - "datasets>=3.4.1,<4.0.0", + "datasets>=3.4.1,!=4.0.*,!=4.1.0", "sentencepiece>=0.2.0", "tqdm", "psutil", "wheel>=0.42.0", "numpy", "accelerate>=0.34.1", - "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,!=0.19.0", + "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,!=0.19.0,<=0.23.0", "peft>=0.7.1,!=0.11.0", "protobuf", "huggingface_hub>=0.34.0", @@ -453,11 +453,11 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.11", + "unsloth_zoo>=2025.9.12", "packaging", "tyro", - "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", - "datasets>=3.4.1,<4.0.0", + "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", + "datasets>=3.4.1,!=4.0.*,!=4.1.0", "sentencepiece>=0.2.0", "tqdm", "psutil", @@ -471,7 +471,7 @@ 
colab-new = [ ] colab-no-deps = [ "accelerate>=0.34.1", - "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,!=0.19.0", + "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,!=0.19.0,<=0.23.0", "peft>=0.7.1", "xformers", "bitsandbytes>=0.45.5", From d225f7f1eb4f143812ec5637b4c8cc9a3fe846c7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 04:59:50 -0700 Subject: [PATCH 213/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 2df9b878d0..3844df7e97 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.10" +__version__ = "2025.9.11" __all__ = [ "SUPPORTS_BFLOAT16", From 5d6c3d9c9eaf080454ad83607e9c83f29bb4ecaa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 05:01:49 -0700 Subject: [PATCH 214/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a3aa62d37b..510e186cde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.12", + "unsloth_zoo>=2025.9.13", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.12", + "unsloth_zoo>=2025.9.13", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", From af56af339e62d569028802f54ffd26b62923c57c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 05:02:47 -0700 Subject: [PATCH 215/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3844df7e97..2df9b878d0 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.11" +__version__ = "2025.9.10" __all__ = [ "SUPPORTS_BFLOAT16", From eb2d403be4434e60a88cade57df5785bfbe01e31 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Oct 2025 04:40:24 -0700 Subject: [PATCH 216/272] Update vision.py --- unsloth/models/vision.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 9d086a04d5..71e4a98a90 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -561,7 +561,9 @@ def from_pretrained( # model.device also will change to CPU so change back m = model while hasattr(m, "model"): - if hasattr(m, "device"): m._old_device_ = m.device + if hasattr(m, "device"): + m._old_device_ = m.device + print(m._old_device_) m = m.model if hasattr(m, "device"): m._old_device_ = m.device @@ -575,11 +577,13 @@ def from_pretrained( try: m.device = m._old_device_ except: pass del m._old_device_ + print(m._old_device_) m = m.model if hasattr(m, "device"): try: m.device = m._old_device_ except: pass del m._old_device_ + print(m._old_device_) # Add hooks to move inputs to CPU and back to CUDA # [TODO] Doesn't seem to work! 
From 9bc76e8db536334dc6a69aeeffa9d578bddc0554 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Oct 2025 04:43:01 -0700 Subject: [PATCH 217/272] Update vision.py --- unsloth/models/vision.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 71e4a98a90..8bd2eea182 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -576,14 +576,15 @@ def from_pretrained( if hasattr(m, "device") and hasattr(m, "_old_device_"): try: m.device = m._old_device_ except: pass + print(m._old_device_, m.device) del m._old_device_ - print(m._old_device_) m = m.model if hasattr(m, "device"): try: m.device = m._old_device_ except: pass del m._old_device_ - print(m._old_device_) + print(m._old_device_, m.device) + print(model.device) # Add hooks to move inputs to CPU and back to CUDA # [TODO] Doesn't seem to work! From a0425bb45737499ce2dc87e0cd8ef982b2b0a24c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Oct 2025 04:44:21 -0700 Subject: [PATCH 218/272] Update vision.py --- unsloth/models/vision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 8bd2eea182..fd8cf04a64 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -563,7 +563,7 @@ def from_pretrained( while hasattr(m, "model"): if hasattr(m, "device"): m._old_device_ = m.device - print(m._old_device_) + print(m._old_device_, m.device) m = m.model if hasattr(m, "device"): m._old_device_ = m.device @@ -582,8 +582,8 @@ def from_pretrained( if hasattr(m, "device"): try: m.device = m._old_device_ except: pass - del m._old_device_ print(m._old_device_, m.device) + del m._old_device_ print(model.device) # Add hooks to move inputs to CPU and back to CUDA From b0ba73cd7e6f5005e271dade905f4fd6fe327ec9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Oct 2025 04:46:20 -0700 Subject: [PATCH 219/272] Update vision.py --- 
unsloth/models/vision.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index fd8cf04a64..c07c3221f9 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -574,14 +574,12 @@ def from_pretrained( m = model while hasattr(m, "model"): if hasattr(m, "device") and hasattr(m, "_old_device_"): - try: m.device = m._old_device_ - except: pass + m.device = m._old_device_ print(m._old_device_, m.device) del m._old_device_ m = m.model if hasattr(m, "device"): - try: m.device = m._old_device_ - except: pass + m.device = m._old_device_ print(m._old_device_, m.device) del m._old_device_ print(model.device) From f85a91a90b04281880afa88e28996ae9e43c4aa5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Oct 2025 04:57:03 -0700 Subject: [PATCH 220/272] Update vision.py --- unsloth/models/vision.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index c07c3221f9..fbe3c832c2 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -557,33 +557,8 @@ def from_pretrained( nbytes = embed_tokens.weight.numel() * embed_tokens.weight.itemsize ngb = round(nbytes / 1024 / 1024 / 1024, 2) print(f"Unsloth: Offloading embeddings to RAM to save {ngb} GB.") - - # model.device also will change to CPU so change back - m = model - while hasattr(m, "model"): - if hasattr(m, "device"): - m._old_device_ = m.device - print(m._old_device_, m.device) - m = m.model - if hasattr(m, "device"): m._old_device_ = m.device - - # Move embeddings to CPU embed_tokens.to("cpu") - # model.device also will change to CPU so change back - m = model - while hasattr(m, "model"): - if hasattr(m, "device") and hasattr(m, "_old_device_"): - m.device = m._old_device_ - print(m._old_device_, m.device) - del m._old_device_ - m = m.model - if hasattr(m, "device"): - m.device = m._old_device_ - print(m._old_device_, m.device) - del 
m._old_device_ - print(model.device) - # Add hooks to move inputs to CPU and back to CUDA # [TODO] Doesn't seem to work! # def pre_hook(module, args): From 47f2ef72631b6515bac8c1d353ec371fd129c6d1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 4 Oct 2025 23:39:09 -0700 Subject: [PATCH 221/272] Update vision.py --- unsloth/models/vision.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index fbe3c832c2..a7c12e9644 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -636,7 +636,9 @@ def from_pretrained( # Check float32 norm weights if os.environ.get("UNSLOTH_HIGH_PRECISION_LAYERNORM", "0") == "1": for jj, (name, module) in enumerate(model.named_modules()): - if name.endswith("norm") and hasattr(module, "weight"): + if (name.endswith(("norm", "norm1", "norm2", "norm3", "norm4")) \ + or "layernorm" in name or "layer_norm" in name) \ + and hasattr(module, "weight"): module._pre_set_compute_dtype = torch.float32 pass # Edit data-types From 06fc86f084a898c356ed137f11a2b671a50bbe64 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 5 Oct 2025 00:40:16 -0700 Subject: [PATCH 222/272] New models --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- unsloth/models/mapper.py | 46 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6c0b7f8ca1..1bbb9c657a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.14", + "unsloth_zoo>=2025.10.1", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.14", + "unsloth_zoo>=2025.10.1", "packaging", "tyro", 
"transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3079196e68..8650e21438 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.11" +__version__ = "2025.10.1" __all__ = [ "SUPPORTS_BFLOAT16", diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index eb9119b681..600396ed46 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -966,6 +966,52 @@ "mistralai/Magistral-Small-2509", "unsloth/Magistral-Small-2509-bnb-4bit", ), + "unsloth/Apertus-70B-Instruct-2509-unsloth-bnb-4bit" : ( + "unsloth/Apertus-70B-Instruct-2509", + "swiss-ai/Apertus-70B-2509", + "unsloth/Apertus-70B-Instruct-2509-unsloth-bnb-4bit", + ), + "unsloth/Apertus-8B-Instruct-2509-unsloth-bnb-4bit" : ( + "unsloth/Apertus-8B-Instruct-2509", + "swiss-ai/Apertus-8B-2509", + "unsloth/Apertus-8B-Instruct-2509-unsloth-bnb-4bit", + ), + "unsloth/granite-4.0-micro-unsloth-bnb-4bit" : ( + "unsloth/granite-4.0-micro", + "ibm-granite/granite-4.0-micro", + "unsloth/granite-4.0-micro-bnb-4bit", + ), + "unsloth/granite-4.0-h-micro-unsloth-bnb-4bit" : ( + "unsloth/granite-4.0-h-micro", + "ibm-granite/granite-4.0-h-micro", + "unsloth/granite-4.0-h-micro-bnb-4bit", + ), + "unsloth/granite-4.0-micro-base-unsloth-bnb-4bit" : ( + "unsloth/granite-4.0-micro-base", + "ibm-granite/granite-4.0-micro-base", + "unsloth/granite-4.0-micro-base-bnb-4bit", + ), + "unsloth/granite-4.0-h-micro-base-unsloth-bnb-4bit" : ( + "unsloth/granite-4.0-h-micro-base", + "ibm-granite/granite-4.0-h-micro-base", + "unsloth/granite-4.0-h-micro-base-bnb-4bit", + ), + "unsloth/granite-4.0-h-tiny" : ( + "unsloth/granite-4.0-h-tiny", + "ibm-granite/granite-4.0-h-tiny", + ), + "unsloth/granite-4.0-h-small" : ( + "unsloth/granite-4.0-h-small", + 
"ibm-granite/granite-4.0-h-small", + ), + "unsloth/granite-4.0-h-tiny-base" : ( + "unsloth/granite-4.0-h-tiny-base", + "ibm-granite/granite-4.0-h-tiny-base", + ), + "unsloth/granite-4.0-h-small-base" : ( + "unsloth/granite-4.0-h-small-base", + "ibm-granite/granite-4.0-h-small-base", + ), } INT_TO_FLOAT_MAPPER = {} From 778da7dcf8e375c71339d77405ad6b491c149515 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:32:29 -0700 Subject: [PATCH 223/272] Update llama.py --- unsloth/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 8ff74872a3..21b10f15cf 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1195,6 +1195,7 @@ def _CausalLM_fast_forward( logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(dtype)) else: RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1" + print("RETURN_LOGITS", RETURN_LOGITS) # < 1024 Normal Unsloth uses less VRAM! if bsz*q_len <= 1024: RETURN_LOGITS = True From ed443ee0c8057831d029bdd5fa6920994d9e80a8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:36:22 -0700 Subject: [PATCH 224/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0b10d0ca13..2f812c769f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ triton = [ "triton-windows ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", ] huggingface = [ - "unsloth_zoo>=2025.10.3", + "unsloth_zoo>=2025.10.4", "wheel>=0.42.0", "packaging", "torchvision", @@ -458,7 +458,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.3", + "unsloth_zoo>=2025.10.4", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", diff --git 
a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 93575a043d..5746f91694 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.10.3" +__version__ = "2025.10.4" __all__ = [ "SUPPORTS_BFLOAT16", From da00e2f0d1f9859ad9941adfce7dc8d60bb30622 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:36:59 -0700 Subject: [PATCH 225/272] Update _utils.py --- unsloth/models/_utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5746f91694..f5948cef1a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1652,14 +1652,17 @@ def error_out_no_vllm(*args, **kwargs): raise NotImplementedError("Unsloth: vLLM is not yet supported for fast inference for this model! Please use `.generate` instead") -from torchao.core.config import AOBaseConfig try: - from torchao.quantization import Int4WeightOnlyConfig + from torchao.core.config import AOBaseConfig + try: + from torchao.quantization import Int4WeightOnlyConfig + except: + print("Unsloth: TorchAO changed `torchao.quantization.Int4WeightOnlyConfig`") + Int4WeightOnlyConfig = None + pass except: - print("Unsloth: TorchAO changed `torchao.quantization.Int4WeightOnlyConfig`") - Int4WeightOnlyConfig = None -pass - + AOBaseConfig = None + pass @dataclass class TorchAOConfig: qat_scheme : str = "int4" From 250ea60650ebc245cb2278bc63756e7df5c13db4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:37:50 -0700 Subject: [PATCH 226/272] Update llama.py --- unsloth/models/llama.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 21b10f15cf..8ff74872a3 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1195,7 +1195,6 @@ def _CausalLM_fast_forward( logits = 
self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(dtype)) else: RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1" - print("RETURN_LOGITS", RETURN_LOGITS) # < 1024 Normal Unsloth uses less VRAM! if bsz*q_len <= 1024: RETURN_LOGITS = True From a921ea6450a8c3f2dc62e78cb4821abd317066e4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:38:16 -0700 Subject: [PATCH 227/272] Update _utils.py --- unsloth/models/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index f5948cef1a..bf7d441c38 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1662,6 +1662,7 @@ def error_out_no_vllm(*args, **kwargs): pass except: AOBaseConfig = None + Int4WeightOnlyConfig = None pass @dataclass class TorchAOConfig: From c90df8745fab97c254902cd6f453341ee8d004c7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:40:44 -0700 Subject: [PATCH 228/272] Update llama.py --- unsloth/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 8ff74872a3..21b10f15cf 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1195,6 +1195,7 @@ def _CausalLM_fast_forward( logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(dtype)) else: RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1" + print("RETURN_LOGITS", RETURN_LOGITS) # < 1024 Normal Unsloth uses less VRAM! 
if bsz*q_len <= 1024: RETURN_LOGITS = True From c64f0113f2fe9bed8b7d3130435bba9eb7f6c4cf Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:47:41 -0700 Subject: [PATCH 229/272] Fix AMD --- unsloth/models/_utils.py | 4 ++-- unsloth/models/llama.py | 9 ++++++--- unsloth/models/mistral.py | 7 ++++++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index bf7d441c38..ab3a9e058e 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -458,8 +458,8 @@ def patch_mistral_nemo_config(config): config = re.sub( r"(\*\*kwargs)[\s]{0,}\,[\s]{0,}\)[\s]{0,}\:", r"rope_scaling=None,"\ - r"\n **kwargs):\n"\ - r"\n self.rope_scaling = rope_scaling\n", + r"\n \*\*kwargs):\n"\ + r"\n self\.rope_scaling = rope_scaling\n", config, ) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 21b10f15cf..75c4e14308 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1195,12 +1195,15 @@ def _CausalLM_fast_forward( logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(dtype)) else: RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1" - print("RETURN_LOGITS", RETURN_LOGITS) # < 1024 Normal Unsloth uses less VRAM! - if bsz*q_len <= 1024: RETURN_LOGITS = True + if DEVICE_TYPE == "hip": + # [TODO] AMD GPUs fail on chunked_cross_entropy loss! 
+ # RuntimeError: Triton Error [HIP]: Code: 1, Messsage: invalid argument + RETURN_LOGITS = False + elif bsz*q_len <= 1024: + RETURN_LOGITS = True if not RETURN_LOGITS and labels is not None: - n_items = kwargs.get("num_items_in_batch", None) if n_items is None: n_items = kwargs.get("n_items", None) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index faab2d30b1..b547739df2 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -298,7 +298,12 @@ def MistralForCausalLM_fast_forward( else: RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1" # < 1024 Normal Unsloth uses less VRAM! - if bsz * q_len <= 1024: RETURN_LOGITS = True + if DEVICE_TYPE == "hip": + # [TODO] AMD GPUs fail on chunked_cross_entropy loss! + # RuntimeError: Triton Error [HIP]: Code: 1, Messsage: invalid argument + RETURN_LOGITS = False + elif bsz*q_len <= 1024: + RETURN_LOGITS = True if not RETURN_LOGITS and labels is not None: n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None) From 8eecf7d62e481b03e4f27b232c8d5eeb641a9973 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:49:48 -0700 Subject: [PATCH 230/272] Update _utils.py --- unsloth/models/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index ab3a9e058e..bf7d441c38 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -458,8 +458,8 @@ def patch_mistral_nemo_config(config): config = re.sub( r"(\*\*kwargs)[\s]{0,}\,[\s]{0,}\)[\s]{0,}\:", r"rope_scaling=None,"\ - r"\n \*\*kwargs):\n"\ - r"\n self\.rope_scaling = rope_scaling\n", + r"\n **kwargs):\n"\ + r"\n self.rope_scaling = rope_scaling\n", config, ) From c22b9a351993c89bf2b05f364fe476222c9f4d41 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:54:01 -0700 Subject: [PATCH 231/272] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 75c4e14308..42c0eaf8ed 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1764,7 +1764,8 @@ def unsloth_fast_generate( kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id) # Mixed precision autocast - with torch.inference_mode(), torch.autocast(device_type = DEVICE_TYPE, dtype = dtype): + device_type = DEVICE_TYPE if DEVICE_TYPE != "hip" else "cuda" # hip doesn't work + with torch.inference_mode(), torch.autocast(device_type = device_type, dtype = dtype): output = self._old_generate(*args, **kwargs) pass From 38b9e00d81c119a858822c2a902f48744fbcfc14 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:56:24 -0700 Subject: [PATCH 232/272] Update vision.py --- unsloth/models/vision.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index b90ad00cf8..cfc1d0d082 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -203,11 +203,12 @@ def unsloth_base_fast_generate( except: pass # Mixed precision autocast + device_type = DEVICE_TYPE if DEVICE_TYPE != "hip" else "cuda" # hip doesn't work if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "1": - autocaster = torch.autocast(device_type = "cuda", dtype = torch.float16) + autocaster = torch.autocast(device_type = device_type, dtype = torch.float16) dtype = torch.float16 else: - autocaster = torch.autocast(device_type = "cuda", dtype = dtype) + autocaster = torch.autocast(device_type = device_type, dtype = dtype) # Prepare LoRA # state_dict = convert_lora_modules(self, dtype = dtype) From b99dcd5e469d1150bb0f930dc1414541574befcf Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 06:03:08 -0700 Subject: [PATCH 233/272] DEVICE_TYPE_TORCH --- unsloth/__init__.py | 2 ++ unsloth/models/_utils.py | 2 +- unsloth/models/llama.py | 33 ++++++++++++++++----------------- unsloth/models/vision.py | 7 +++---- 4 
files changed, 22 insertions(+), 22 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 5dd16bae99..45719d472c 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -96,6 +96,8 @@ def get_device_type(): raise NotImplementedError("Unsloth currently only works on NVIDIA, AMD and Intel GPUs.") pass DEVICE_TYPE : str = get_device_type() +# HIP fails for autocast and other torch functions. Use CUDA instead +DEVICE_TYPE_TORCH = DEVICE_TYPE if DEVICE_TYPE != "hip" else DEVICE_TYPE @functools.cache def get_device_count(): diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index bf7d441c38..e787f55532 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -87,7 +87,7 @@ import warnings, subprocess, re, inspect, psutil, os, math from unsloth_zoo.utils import Version from importlib.metadata import version as importlib_version -from unsloth import DEVICE_TYPE, DEVICE_COUNT +from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH from unsloth_zoo.log import logger from unsloth_zoo.tokenizer_utils import ( patch_tokenizer as _patch_tokenizer, diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 42c0eaf8ed..596042288d 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -27,7 +27,7 @@ from unsloth_zoo.utils import Version, _get_dtype from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs, fix_lora_auto_mapping from unsloth_zoo.peft_utils import SKIP_QUANTIZATION_MODULES -from unsloth import DEVICE_TYPE, DEVICE_COUNT +from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH transformers_version = Version(transformers_version) # Transformers moved rotary embeddings out of all attention layers @@ -732,7 +732,7 @@ def LlamaModel_fast_forward( position_ids = torch.arange( past_key_values_length, seq_length + past_key_values_length, dtype = torch.int32, - device = f"{DEVICE_TYPE}:0", + device = f"{DEVICE_TYPE_TORCH}:0", ) position_ids = 
position_ids.unsqueeze(0).view(-1, seq_length) elif position_ids is not None: @@ -905,13 +905,13 @@ def LlamaModel_fast_forward( is_causal = True, sliding_window = self.config.sliding_window, )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = DEVICE_TYPE,)\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = DEVICE_TYPE_TORCH,)\ .squeeze(0).squeeze(0) self.GA_mask = AttentionMaskConverter( is_causal = True, )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = DEVICE_TYPE,)\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = DEVICE_TYPE_TORCH,)\ .squeeze(0).squeeze(0) pass pass @@ -1028,11 +1028,11 @@ def LlamaModel_fast_forward_inference_custom( bsz, q_len, hd = X.shape assert(q_len == 1) # Get saved buffers to reduce memory movement - residual = torch.empty((bsz, q_len, hd), dtype = torch.float32, device = f"{DEVICE_TYPE}:0") - _XX = torch.empty((2, bsz, q_len, hd), dtype = torch.float32, device = f"{DEVICE_TYPE}:0") + residual = torch.empty((bsz, q_len, hd), dtype = torch.float32, device = f"{DEVICE_TYPE_TORCH}:0") + _XX = torch.empty((2, bsz, q_len, hd), dtype = torch.float32, device = f"{DEVICE_TYPE_TORCH}:0") XX, XX2 = _XX[0], _XX[1] - variance = torch.empty((bsz, q_len, 1), dtype = torch.float32, device = f"{DEVICE_TYPE}:0") - temp_mlp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = f"{DEVICE_TYPE}:0") + variance = torch.empty((bsz, q_len, 1), dtype = torch.float32, device = f"{DEVICE_TYPE_TORCH}:0") + temp_mlp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = f"{DEVICE_TYPE_TORCH}:0") temp_gates, temp_ups = tuple(temp_mlp[0].to(torch.device(x)) for x in range(DEVICE_COUNT)), tuple(temp_mlp[1].to(torch.device(x)) for x in range(DEVICE_COUNT)) seq_len = past_key_values[0][0].shape[-2] @@ -1378,7 +1378,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") 
else 1.0 dim = getattr(config, "head_dim", None) if dim is None: dim = int((config.hidden_size // config.num_attention_heads)) - device = DEVICE_TYPE + device = DEVICE_TYPE_TORCH max_position_embeddings = config.max_position_embeddings pass @@ -1490,7 +1490,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= base = config.rope_theta partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 dim = int((config.hidden_size // config.num_attention_heads)) - device = DEVICE_TYPE + device = DEVICE_TYPE_TORCH max_position_embeddings = config.max_position_embeddings pass @@ -1610,7 +1610,7 @@ def __init__(self, base = config.rope_theta partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 dim = int((config.hidden_size // config.num_attention_heads)) - device = DEVICE_TYPE + device = DEVICE_TYPE_TORCH max_position_embeddings = config.max_position_embeddings pass @@ -1764,8 +1764,7 @@ def unsloth_fast_generate( kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id) # Mixed precision autocast - device_type = DEVICE_TYPE if DEVICE_TYPE != "hip" else "cuda" # hip doesn't work - with torch.inference_mode(), torch.autocast(device_type = device_type, dtype = dtype): + with torch.inference_mode(), torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = dtype): output = self._old_generate(*args, **kwargs) pass @@ -2389,7 +2388,7 @@ def get_peft_model( pass model.get_input_embeddings().modules_to_save.default\ - .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True) + .to(device = DEVICE_TYPE_TORCH, dtype = new_dtype, non_blocking = True) model.get_input_embeddings().modules_to_save.default.requires_grad_(True) # [TODO] Move old embed_tokens to CPU - should be disk! 
@@ -2409,7 +2408,7 @@ def get_peft_model( pass model.get_output_embeddings().modules_to_save.default\ - .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True) + .to(device = DEVICE_TYPE_TORCH, dtype = new_dtype, non_blocking = True) model.get_output_embeddings().modules_to_save.default.requires_grad_(True) # [TODO] Move old lm_head to CPU - should be disk! @@ -2678,7 +2677,7 @@ def get_peft_model( pass model.get_input_embeddings().modules_to_save.default\ - .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True) + .to(device = DEVICE_TYPE_TORCH, dtype = new_dtype, non_blocking = True) model.get_input_embeddings().modules_to_save.default.requires_grad_(True) pass @@ -2694,7 +2693,7 @@ def get_peft_model( pass model.get_output_embeddings().modules_to_save.default\ - .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True) + .to(device = DEVICE_TYPE_TORCH, dtype = new_dtype, non_blocking = True) model.get_output_embeddings().modules_to_save.default.requires_grad_(True) pass diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index cfc1d0d082..b2704876e3 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -71,7 +71,7 @@ # Old HF Hub versions <= 0.0.25 from huggingface_hub.utils._token import get_token pass -from unsloth import DEVICE_TYPE, DEVICE_COUNT +from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH __all__ = [ "FastBaseModel", @@ -203,12 +203,11 @@ def unsloth_base_fast_generate( except: pass # Mixed precision autocast - device_type = DEVICE_TYPE if DEVICE_TYPE != "hip" else "cuda" # hip doesn't work if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "1": - autocaster = torch.autocast(device_type = device_type, dtype = torch.float16) + autocaster = torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = torch.float16) dtype = torch.float16 else: - autocaster = torch.autocast(device_type = device_type, dtype = dtype) + autocaster = torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = 
dtype) # Prepare LoRA # state_dict = convert_lora_modules(self, dtype = dtype) From 19bc977f21fbb678d6bd961ca89cdbc00f66b90d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 06:07:00 -0700 Subject: [PATCH 234/272] Update __init__.py --- unsloth/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 45719d472c..d2b77f6b37 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -97,7 +97,8 @@ def get_device_type(): pass DEVICE_TYPE : str = get_device_type() # HIP fails for autocast and other torch functions. Use CUDA instead -DEVICE_TYPE_TORCH = DEVICE_TYPE if DEVICE_TYPE != "hip" else DEVICE_TYPE +DEVICE_TYPE_TORCH = DEVICE_TYPE +if DEVICE_TYPE_TORCH == "hip": DEVICE_TYPE_TORCH = "cuda" @functools.cache def get_device_count(): From 5aa6a39ad848ef901db2788cd8d32f333ba8b463 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 06:37:29 -0700 Subject: [PATCH 235/272] Update __init__.py --- unsloth/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index d2b77f6b37..99d651ae5f 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -149,7 +149,9 @@ def get_device_count(): # OutOfResources: out of resource: shared memory, Required: 98304, Hardware limit: 65536. 
Reducing block sizes or `num_stages` if (major_torch >= 2 and minor_torch >= 8) or (major_torch > 2): os.environ["UNSLOTH_ENABLE_CCE"] = "0" -pass +elif DEVICE_TYPE == "hip": + # CCE also fails in HIP / AMD + os.environ["UNSLOTH_ENABLE_CCE"] = "0" # Fix other issues import importlib.util From 0576c13e0ce1fa300ed08244c10b1b8bfad197f4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 21:36:05 -0700 Subject: [PATCH 236/272] Update _utils.py --- unsloth/models/_utils.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index beefada7bf..cb98db00f1 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1346,6 +1346,32 @@ def patch_gradient_accumulation_fix(Trainer): # Also fix passing in num_items_in_batch if not hasattr(Trainer, "_old_compute_loss"): + + # Fix transformers 4.57.0 causing `Output 0 of UnslothFusedLossBackward is a view and is being modified inplace.` + function = inspect.getsource(Trainer.compute_loss) + if "loss *=" in function or "loss*=" in function: + where = function.find("def") + function = function.split("\n") + function = "\n".join(x[where:] for x in function) + + # Import all variables that need importing + import transformers.trainer + items_in_trainer = dir(transformers.trainer) + good_items = [] + for item in items_in_trainer: + if item in function: good_items.append(item) + pass + exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) + + # Replace loss*= with loss = loss * + function = re.sub( + r"loss[\s]{0,}\*\=", + "loss = loss *", + function, + ) + exec(function, globals()) + Trainer.compute_loss = compute_loss + pass Trainer._old_compute_loss = Trainer.compute_loss Trainer.compute_loss = _unsloth_pre_compute_loss pass From ee46343c57d8c8628d3de401a8865cac83bfe416 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 23:26:06 -0700 Subject: [PATCH 237/272] Move DEVICE_TYPE 
--- unsloth/__init__.py | 56 +++++----------------- unsloth/device_type.py | 80 +++++++++++++++++++++++++++++++ unsloth/kernels/utils.py | 9 +++- unsloth/models/_utils.py | 10 +++- unsloth/models/llama.py | 9 +++- unsloth/models/loader.py | 33 ++++++++++++- unsloth/models/rl_replacements.py | 9 +++- unsloth/models/vision.py | 9 +++- 8 files changed, 165 insertions(+), 50 deletions(-) create mode 100644 unsloth/device_type.py diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 99d651ae5f..b336ad03fe 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -69,49 +69,14 @@ raise exception pass -@functools.cache -def is_hip(): - return bool(getattr(getattr(torch, "version", None), "hip", None)) -pass - -@functools.cache -def get_device_type(): - if hasattr(torch, "cuda") and torch.cuda.is_available(): - if is_hip(): - return "hip" - return "cuda" - elif hasattr(torch, "xpu") and torch.xpu.is_available(): - return "xpu" - # Check torch.accelerator - if hasattr(torch, "accelerator"): - if not torch.accelerator.is_available(): - raise NotImplementedError("Unsloth cannot find any torch accelerator? You need a GPU.") - accelerator = str(torch.accelerator.current_accelerator()) - if accelerator in ("cuda", "xpu", "hip"): - raise RuntimeError( - f"Unsloth: Weirdly `torch.cuda.is_available()`, `torch.xpu.is_available()` and `is_hip` all failed.\n"\ - f"But `torch.accelerator.current_accelerator()` works with it being = `{accelerator}`\n"\ - f"Please reinstall torch - it's most likely broken :(" - ) - raise NotImplementedError("Unsloth currently only works on NVIDIA, AMD and Intel GPUs.") -pass -DEVICE_TYPE : str = get_device_type() -# HIP fails for autocast and other torch functions. 
Use CUDA instead -DEVICE_TYPE_TORCH = DEVICE_TYPE -if DEVICE_TYPE_TORCH == "hip": DEVICE_TYPE_TORCH = "cuda" - -@functools.cache -def get_device_count(): - if DEVICE_TYPE in ("cuda", "hip"): - return torch.cuda.device_count() - elif DEVICE_TYPE == "xpu": - return torch.xpu.device_count() - else: - return 1 -pass - -DEVICE_COUNT : int = get_device_count() - +from .device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) # Reduce VRAM usage by reducing fragmentation # And optimize pinning of memory # TODO(billishyahao): need to add hip related optimization... @@ -201,7 +166,10 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 else: from triton.common.build import libcuda_dirs # Try loading bitsandbytes and triton - import bitsandbytes as bnb + try: + import bitsandbytes as bnb + except: + print("Unsloth: `bitsandbytes` is not installed - 4bit QLoRA unallowed, but 16bit and full finetuning works!") try: cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32 libcuda_dirs() diff --git a/unsloth/device_type.py b/unsloth/device_type.py new file mode 100644 index 0000000000..547750019a --- /dev/null +++ b/unsloth/device_type.py @@ -0,0 +1,80 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__all__ = [ + "is_hip", + "get_device_type", + "DEVICE_TYPE", + "DEVICE_TYPE_TORCH", + "DEVICE_COUNT", + "ALLOW_PREQUANTIZED_MODELS", +] + +import torch +import functools + +@functools.cache +def is_hip(): + return bool(getattr(getattr(torch, "version", None), "hip", None)) +pass + +@functools.cache +def get_device_type(): + if hasattr(torch, "cuda") and torch.cuda.is_available(): + if is_hip(): + return "hip" + return "cuda" + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + return "xpu" + # Check torch.accelerator + if hasattr(torch, "accelerator"): + if not torch.accelerator.is_available(): + raise NotImplementedError("Unsloth cannot find any torch accelerator? You need a GPU.") + accelerator = str(torch.accelerator.current_accelerator()) + if accelerator in ("cuda", "xpu", "hip"): + raise RuntimeError( + f"Unsloth: Weirdly `torch.cuda.is_available()`, `torch.xpu.is_available()` and `is_hip` all failed.\n"\ + f"But `torch.accelerator.current_accelerator()` works with it being = `{accelerator}`\n"\ + f"Please reinstall torch - it's most likely broken :(" + ) + raise NotImplementedError("Unsloth currently only works on NVIDIA, AMD and Intel GPUs.") +pass +DEVICE_TYPE : str = get_device_type() +# HIP fails for autocast and other torch functions. 
Use CUDA instead +DEVICE_TYPE_TORCH = DEVICE_TYPE +if DEVICE_TYPE_TORCH == "hip": DEVICE_TYPE_TORCH = "cuda" + +@functools.cache +def get_device_count(): + if DEVICE_TYPE in ("cuda", "hip"): + return torch.cuda.device_count() + elif DEVICE_TYPE == "xpu": + return torch.xpu.device_count() + else: + return 1 +pass + +DEVICE_COUNT : int = get_device_count() + +# Check blocksize for 4bit -> 64 for CUDA, 128 for AMD +# If AMD, we cannot load pre-quantized models for now :( +ALLOW_PREQUANTIZED_MODELS : bool = True +if DEVICE_TYPE == "hip": + try: + from bitsandbytes.nn.modules import Params4bit + if "blocksize = 64 if not HIP_ENVIRONMENT else 128" in inspect.getsource(Params4bit): + ALLOW_PREQUANTIZED_MODELS = False + except: + pass +pass diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py index 9a46d0d5d7..16fc694230 100644 --- a/unsloth/kernels/utils.py +++ b/unsloth/kernels/utils.py @@ -19,7 +19,14 @@ import functools from typing import Optional -from .. import DEVICE_TYPE, DEVICE_COUNT +from ..device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) from .fp8 import weight_dequant, fp8_linear # torch.cuda.amp.custom_fwd is deprecated >= 2.4 diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index cb98db00f1..35c0a2fcd5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -87,7 +87,14 @@ import warnings, subprocess, re, inspect, psutil, os, math from unsloth_zoo.utils import Version from importlib.metadata import version as importlib_version -from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH +from ..device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) from unsloth_zoo.log import logger from unsloth_zoo.tokenizer_utils import ( patch_tokenizer as _patch_tokenizer, @@ -1331,6 +1338,7 @@ def _unsloth_pre_compute_loss(self, model, inputs, *args, **kwargs): 
def patch_gradient_accumulation_fix(Trainer): # Fixes gradient accumulation + # Fixes Output 0 of UnslothFusedLossBackward is a view and is being modified inplace. import inspect if hasattr(Trainer, "get_batch_samples"): if Trainer.get_batch_samples.__name__ == "_unsloth_get_batch_samples": return diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 596042288d..535537a3a1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -27,7 +27,14 @@ from unsloth_zoo.utils import Version, _get_dtype from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs, fix_lora_auto_mapping from unsloth_zoo.peft_utils import SKIP_QUANTIZATION_MODULES -from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH +from ..device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) transformers_version = Version(transformers_version) # Transformers moved rotary embeddings out of all attention layers diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 356bca8a29..ecafc5b2ce 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -45,6 +45,14 @@ pass from huggingface_hub import HfFileSystem import importlib.util +from ...device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) # https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading! 
from unsloth_zoo.utils import Version, _get_dtype @@ -195,6 +203,12 @@ def from_pretrained( old_model_name = model_name if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) + # Check if pre-quantized models are allowed + # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.removesuffix("-unsloth-bnb-4bit") + model_name = model_name.removesuffix("-bnb-4bit") + pass if USE_MODELSCOPE and not os.path.exists(model_name): from modelscope import snapshot_download @@ -306,6 +320,12 @@ def from_pretrained( model_name = peft_config.base_model_name_or_path if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) + # Check if pre-quantized models are allowed + # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.removesuffix("-unsloth-bnb-4bit") + model_name = model_name.removesuffix("-bnb-4bit") + pass model_config = AutoConfig.from_pretrained( model_name, token = token, @@ -618,6 +638,12 @@ def from_pretrained( old_model_name = model_name if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) + # Check if pre-quantized models are allowed + # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.removesuffix("-unsloth-bnb-4bit") + model_name = model_name.removesuffix("-bnb-4bit") + pass # Check modelscope if USE_MODELSCOPE and not os.path.exists(model_name): @@ -833,7 +859,12 @@ def from_pretrained( model_name = peft_config.base_model_name_or_path if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) - + # Check if pre-quantized 
models are allowed + # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.removesuffix("-unsloth-bnb-4bit") + model_name = model_name.removesuffix("-bnb-4bit") + pass model_config = AutoConfig.from_pretrained( model_name, token = token, diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py index a207514e72..1e68790004 100644 --- a/unsloth/models/rl_replacements.py +++ b/unsloth/models/rl_replacements.py @@ -26,7 +26,14 @@ import inspect from collections import defaultdict from unsloth_zoo.rl_replacements import RL_REPLACEMENTS, left_pack_padding -from unsloth import DEVICE_TYPE +from .device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) import textwrap RL_EXTRA_ARGS = defaultdict(list) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index b2704876e3..f2bd7c306b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -71,7 +71,14 @@ # Old HF Hub versions <= 0.0.25 from huggingface_hub.utils._token import get_token pass -from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH +from ..device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) __all__ = [ "FastBaseModel", From 09fd92546b09ec2107144f73cf8a27de183162b8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 23:27:53 -0700 Subject: [PATCH 238/272] Update rl_replacements.py --- unsloth/models/rl_replacements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py index 1e68790004..7ecffdf63d 100644 --- a/unsloth/models/rl_replacements.py +++ b/unsloth/models/rl_replacements.py @@ -26,7 +26,7 @@ import inspect from collections import defaultdict from 
unsloth_zoo.rl_replacements import RL_REPLACEMENTS, left_pack_padding -from .device_type import ( +from ..device_type import ( is_hip, get_device_type, DEVICE_TYPE, From bd22cb1a7c5d5b5d7d13422d8af181d2093f56d4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 23:30:14 -0700 Subject: [PATCH 239/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ecafc5b2ce..5d1896fd5b 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -45,7 +45,7 @@ pass from huggingface_hub import HfFileSystem import importlib.util -from ...device_type import ( +from ..device_type import ( is_hip, get_device_type, DEVICE_TYPE, From 9fe4d319fd44ec36103bd2d26da54c69f768efe2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 03:32:42 -0700 Subject: [PATCH 240/272] AMD install script --- pyproject.toml | 4 ++-- unsloth/models/_amd_install.sh | 31 +++++++++++++++++++++++++++++++ unsloth/models/_utils.py | 2 +- 3 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 unsloth/models/_amd_install.sh diff --git a/pyproject.toml b/pyproject.toml index 2f812c769f..b490114e12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ triton = [ "triton-windows ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", ] huggingface = [ - "unsloth_zoo>=2025.10.4", + "unsloth_zoo>=2025.10.5", "wheel>=0.42.0", "packaging", "torchvision", @@ -458,7 +458,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.4", + "unsloth_zoo>=2025.10.5", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", diff --git a/unsloth/models/_amd_install.sh b/unsloth/models/_amd_install.sh new file mode 100644 index 0000000000..637fdba7de --- /dev/null +++ 
b/unsloth/models/_amd_install.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# _amd_install.sh +# Non-interactive installer: build tools, PyTorch (ROCm 6.4), bitsandbytes (HIP), and Unsloth from source. +# Usage: +# bash _amd_install.sh +# + +set -euo pipefail +export DEBIAN_FRONTEND=noninteractive + +apt-get update +apt-get install -y --no-install-recommends build-essential cmake git + +pip install \ + torch==2.8.0 torchvision torchaudio torchao==0.13.0 xformers \ + --index-url https://download.pytorch.org/whl/rocm6.4 + +WORKDIR="$(pwd)" +TMPDIR="$(mktemp -d)" +cd "$TMPDIR" +git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git +cd bitsandbytes +arch +cmake -DCOMPUTE_BACKEND=hip -S . +make -j"$(nproc)" +pip install . +cd "$WORKDIR" +rm -rf "$TMPDIR" + +pip install "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" +pip install "unsloth[base] @ git+https://github.com/unslothai/unsloth" diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 35c0a2fcd5..e713c8efff 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.10.4" +__version__ = "2025.10.5" __all__ = [ "SUPPORTS_BFLOAT16", From 302649864f1bd47768975937c7c005b8dbe715bc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 03:34:06 -0700 Subject: [PATCH 241/272] Move AMD --- unsloth/{models => }/_amd_install.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename unsloth/{models => }/_amd_install.sh (100%) diff --git a/unsloth/models/_amd_install.sh b/unsloth/_amd_install.sh similarity index 100% rename from unsloth/models/_amd_install.sh rename to unsloth/_amd_install.sh From c8150dcefafe3b7753cb47ed1750622573381ba8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 03:36:42 -0700 Subject: [PATCH 242/272] Update _amd_install.sh --- unsloth/_amd_install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/_amd_install.sh b/unsloth/_amd_install.sh index 637fdba7de..b83dbd2ad0 100644 --- a/unsloth/_amd_install.sh +++ b/unsloth/_amd_install.sh @@ -27,5 +27,6 @@ pip install . cd "$WORKDIR" rm -rf "$TMPDIR" +pip install --no-deps unsloth unsloth-zoo pip install "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" pip install "unsloth[base] @ git+https://github.com/unslothai/unsloth" From f25c3650cb15b3acdce8086034691b9f9366cde4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 04:00:20 -0700 Subject: [PATCH 243/272] Update pyproject.toml --- pyproject.toml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b490114e12..6409d74c05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,11 +39,10 @@ triton = [ "triton>=3.0.0 ; ('linux' in sys_platform)", "triton-windows ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", ] -huggingface = [ +huggingfacenotorch = [ "unsloth_zoo>=2025.10.5", "wheel>=0.42.0", "packaging", - "torchvision", "numpy", "tqdm", "psutil", @@ -58,6 +57,10 @@ huggingface = [ "diffusers", 
"transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,!=0.19.0,<=0.23.0", +] +huggingface = [ + "unsloth[huggingfacenotorch]", + "torchvision", "unsloth[triton]", ] windows = [ @@ -740,7 +743,12 @@ intel-gpu-torch270 = [ "torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp312-cp312-linux_x86_64.whl#sha256=c806d44aa2ca5d225629f6fbc6c994d5deaac2d2cde449195bc8e3522ddd219a ; ('linux' in sys_platform) and python_version == '3.12' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", "torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp313-cp313-linux_x86_64.whl#sha256=25d8277b7f01d42e2e014ccbab57a2692b6ec4eff8dcf894eda1b297407cf97a ; ('linux' in sys_platform) and python_version == '3.13' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", ] - +amd = [ + "unsloth[huggingfacenotorch]", + "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl ; ('linux' in sys_platform) and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", + "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-win_amd64.whl ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", + "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl ; ('linux' in sys_platform) and (platform_machine == 'aarch64')", +] [project.urls] homepage = "http://www.unsloth.ai" documentation = "https://github.com/unslothai/unsloth" From 315c2cfcc3e554b73a5744736e0c59f32263b20f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 06:53:45 -0700 Subject: [PATCH 244/272] 
Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6409d74c05..57fab8baf3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,6 @@ triton = [ "triton-windows ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", ] huggingfacenotorch = [ - "unsloth_zoo>=2025.10.5", "wheel>=0.42.0", "packaging", "numpy", @@ -60,6 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", + "unsloth_zoo>=2025.10.5", "torchvision", "unsloth[triton]", ] From b5d3df84541421054b695aec6340f03d26036866 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 06:53:55 -0700 Subject: [PATCH 245/272] Delete _amd_install.sh --- unsloth/_amd_install.sh | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 unsloth/_amd_install.sh diff --git a/unsloth/_amd_install.sh b/unsloth/_amd_install.sh deleted file mode 100644 index b83dbd2ad0..0000000000 --- a/unsloth/_amd_install.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash -# _amd_install.sh -# Non-interactive installer: build tools, PyTorch (ROCm 6.4), bitsandbytes (HIP), and Unsloth from source. -# Usage: -# bash _amd_install.sh -# - -set -euo pipefail -export DEBIAN_FRONTEND=noninteractive - -apt-get update -apt-get install -y --no-install-recommends build-essential cmake git - -pip install \ - torch==2.8.0 torchvision torchaudio torchao==0.13.0 xformers \ - --index-url https://download.pytorch.org/whl/rocm6.4 - -WORKDIR="$(pwd)" -TMPDIR="$(mktemp -d)" -cd "$TMPDIR" -git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git -cd bitsandbytes -arch -cmake -DCOMPUTE_BACKEND=hip -S . -make -j"$(nproc)" -pip install . 
-cd "$WORKDIR" -rm -rf "$TMPDIR" - -pip install --no-deps unsloth unsloth-zoo -pip install "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" -pip install "unsloth[base] @ git+https://github.com/unslothai/unsloth" From 55dd1f66a86c08f938e189bb8da2f14c151e6975 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 06:54:35 -0700 Subject: [PATCH 246/272] Update device_type.py --- unsloth/device_type.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unsloth/device_type.py b/unsloth/device_type.py index 547750019a..ac70d26795 100644 --- a/unsloth/device_type.py +++ b/unsloth/device_type.py @@ -19,10 +19,13 @@ "DEVICE_TYPE_TORCH", "DEVICE_COUNT", "ALLOW_PREQUANTIZED_MODELS", + "ALLOW_BITSANDBYTES", ] import torch import functools +from unsloth_zoo.utils import Version +import inspect @functools.cache def is_hip(): @@ -70,11 +73,15 @@ def get_device_count(): # Check blocksize for 4bit -> 64 for CUDA, 128 for AMD # If AMD, we cannot load pre-quantized models for now :( ALLOW_PREQUANTIZED_MODELS : bool = True +# HSA_STATUS_ERROR_EXCEPTION checks - sometimes AMD fails for BnB +ALLOW_BITSANDBYTES : bool = True if DEVICE_TYPE == "hip": try: from bitsandbytes.nn.modules import Params4bit if "blocksize = 64 if not HIP_ENVIRONMENT else 128" in inspect.getsource(Params4bit): ALLOW_PREQUANTIZED_MODELS = False + import bitsandbytes + ALLOW_BITSANDBYTES = Version(bitsandbytes.__version__) > Version("0.48.2.dev0") except: pass pass From 0960fe45a38d06d91864d374ca3f062394fcfd85 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 06:54:53 -0700 Subject: [PATCH 247/272] Update loader.py --- unsloth/models/loader.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 5d1896fd5b..4c6c91f6fc 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -52,6 +52,7 @@ DEVICE_TYPE_TORCH, DEVICE_COUNT, ALLOW_PREQUANTIZED_MODELS, + ALLOW_BITSANDBYTES, ) # 
https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading! @@ -199,6 +200,10 @@ def from_pretrained( ) pass pass + # Check if 4bit is allowed specifically for AMD + if not ALLOW_BITSANDBYTES and not use_exact_model_name: + print("Unsloth: AMD currently is not stable with 4bit bitsandbytes. Disabling for now.") + load_in_4bit = False old_model_name = model_name if not use_exact_model_name: @@ -634,6 +639,10 @@ def from_pretrained( "compatible with `full_finetuning=True`. If you wish to use QAT with LoRA, " "please pass in `qat_scheme` in `FastLanguageModel.get_peft_model(...)` instead." ) + # Check if 4bit is allowed specifically for AMD + if not ALLOW_BITSANDBYTES and not use_exact_model_name: + print("Unsloth: AMD currently is not stable with 4bit bitsandbytes. Disabling for now.") + load_in_4bit = False old_model_name = model_name if not use_exact_model_name: From 5f0d9fa7540e7627141d043427ea36550c2f4026 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 22:20:38 -0700 Subject: [PATCH 248/272] Update _utils.py --- unsloth/models/_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 78ca68b449..1f50b8206e 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -163,7 +163,15 @@ warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "multiprocessing") warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "multiprocess") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "triton") - +try: + # pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True + # was provided to the `Field()` function, which has no effect in the context it was used. + # 'frozen' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. 
+ # This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type. + from pydantic.warnings import UnsupportedFieldAttributeWarning + warnings.filterwarnings(action = "ignore", category = UnsupportedFieldAttributeWarning, module = "pydantic") +except: + pass # Stop "Special tokens have been added in the vocabulary, ..." import logging logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.CRITICAL+1) From 1f726a42943af972aa4c13642cffd401c11cc495 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 22:26:12 -0700 Subject: [PATCH 249/272] Update _utils.py --- unsloth/models/_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 1f50b8206e..fab02563a6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -352,6 +352,14 @@ def filter(self, x): return not (self.text in x.getMessage()) except: pass +# We detected that you are using `from_pretrained` with a meta device context manager or `torch.set_default_device('meta') +try: + from transformers.modeling_utils import logger as modeling_utils_logger + modeling_utils_logger.addFilter(HideLoggingMessage("anti-pattern")) + del modeling_utils_logger +except: + pass + # Errors out on # Some weights of Gemma3nForConditionalGeneration were not initialized from the model checkpoint from transformers.modeling_utils import logger as transformers_logger From 8d29c64257bc1be5f54977f96590cdb5a45708be Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 22:44:16 -0700 Subject: [PATCH 250/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index fab02563a6..c7284c5e96 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -169,7 +169,7 @@ # 'frozen' is field-specific 
metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. # This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type. from pydantic.warnings import UnsupportedFieldAttributeWarning - warnings.filterwarnings(action = "ignore", category = UnsupportedFieldAttributeWarning, module = "pydantic") + warnings.filterwarnings(action = "ignore", category = UnsupportedFieldAttributeWarning) except: pass # Stop "Special tokens have been added in the vocabulary, ..." From 0e9fb1da1e10a902ffba06776e2026b8435a3fb3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 22:49:03 -0700 Subject: [PATCH 251/272] Update _utils.py --- unsloth/models/_utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index c7284c5e96..337413aca4 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -163,15 +163,6 @@ warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "multiprocessing") warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "multiprocess") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "triton") -try: - # pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True - # was provided to the `Field()` function, which has no effect in the context it was used. - # 'frozen' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. - # This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type. 
- from pydantic.warnings import UnsupportedFieldAttributeWarning - warnings.filterwarnings(action = "ignore", category = UnsupportedFieldAttributeWarning) -except: - pass # Stop "Special tokens have been added in the vocabulary, ..." import logging logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.CRITICAL+1) From 9950e27d8d107efba09109225c702d2b6c33ea75 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 22:56:01 -0700 Subject: [PATCH 252/272] Update _utils.py --- unsloth/models/_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 337413aca4..b38fb57c4d 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -214,6 +214,12 @@ def filter(self, x): return not (self.text in x.getMessage()) del vllm_lora_model_logger except: pass + try: + from vllm.attention.utils.fa_utils import logger as vllm_attention_utils_fa_utils_logger + vllm_attention_utils_fa_utils_logger.addFilter(HideLoggingMessage("Cannot use FA version")) + del vllm_attention_utils_fa_utils_logger + except: + pass pass # The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here. From d995f71d86f2bb3a25473d46ac0543e3dc306713 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 23:15:26 -0700 Subject: [PATCH 253/272] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 067f2596c6..b35c1326ca 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -345,7 +345,23 @@ def fix_sentencepiece_tokenizer( ): # From https://github.com/google/sentencepiece/issues/121 # We need to manually edit the sentencepiece tokenizer! 
- from transformers.utils import sentencepiece_model_pb2 + try: + from transformers.convert_slow_tokenizer import import_protobuf + sentencepiece_model_pb2 = import_protobuf() + except Exception as e: + try: + import google.protobuf + from unsloth_zoo.utils import Version + protobuf_version = Version(google.protobuf.__version__) + if protobuf_version > Version("3.20.3"): + raise RuntimeError( + f"Unsloth: Your protobuf version = {protobuf_version} is too new.\n"\ + f"Please downgrade via `pip install --force-reinstall protobuf==3.20.3`" + ) + except: + # This will only work for older SentencePiece versions <= 3.20.3 + from transformers.utils import sentencepiece_model_pb2 + pass if not os.path.exists(temporary_location): os.makedirs(temporary_location) From c4db81bc773329ee5488b325e69c5232497338f6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 23:19:04 -0700 Subject: [PATCH 254/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fb63f89ae6..624054caed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", - "unsloth_zoo>=2025.10.6", + "unsloth_zoo>=2025.10.7", "torchvision", "unsloth[triton]", ] @@ -461,7 +461,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.6", + "unsloth_zoo>=2025.10.7", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index b38fb57c4d..22ce2fd9e5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.10.6" +__version__ = "2025.10.7" __all__ = [ "SUPPORTS_BFLOAT16", From ea37dd68e2a1c7e222ceeb41b7cf11e80ee1e9ef Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 23:19:33 -0700 Subject: [PATCH 255/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 624054caed..1d46c8824b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", - "unsloth_zoo>=2025.10.7", + "unsloth_zoo>=2025.10.8", "torchvision", "unsloth[triton]", ] @@ -461,7 +461,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.7", + "unsloth_zoo>=2025.10.8", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", From 5ff72340553952847773cd5a94a49e2ae9107a8c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 01:39:56 -0700 Subject: [PATCH 256/272] Update loader.py --- unsloth/models/loader.py | 50 +++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 4c6c91f6fc..165b7e0551 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -210,10 +210,14 @@ def from_pretrained( model_name = get_model_name(model_name, load_in_4bit) # Check if pre-quantized models are allowed # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 - if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): - model_name = model_name.removesuffix("-unsloth-bnb-4bit") - model_name = model_name.removesuffix("-bnb-4bit") - pass + if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") + 
model_name = model_name.lower().removesuffix("-bnb-4bit") + # Change -BF16 to all False for 4bit, 8bit etc + if model_name.lower().endswith("-BF16"): + load_in_4bit = False + load_in_8bit = False + load_in_16bit = True if USE_MODELSCOPE and not os.path.exists(model_name): from modelscope import snapshot_download @@ -327,10 +331,15 @@ def from_pretrained( model_name = get_model_name(model_name, load_in_4bit) # Check if pre-quantized models are allowed # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 - if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): - model_name = model_name.removesuffix("-unsloth-bnb-4bit") - model_name = model_name.removesuffix("-bnb-4bit") - pass + if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") + model_name = model_name.lower().removesuffix("-bnb-4bit") + # Change -BF16 to all False for 4bit, 8bit etc + if model_name.lower().endswith("-BF16"): + load_in_4bit = False + load_in_8bit = False + load_in_16bit = True + model_config = AutoConfig.from_pretrained( model_name, token = token, @@ -649,10 +658,14 @@ def from_pretrained( model_name = get_model_name(model_name, load_in_4bit) # Check if pre-quantized models are allowed # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 - if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): - model_name = model_name.removesuffix("-unsloth-bnb-4bit") - model_name = model_name.removesuffix("-bnb-4bit") - pass + if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") + model_name = model_name.lower().removesuffix("-bnb-4bit") + # Change -BF16 to all False for 4bit, 8bit etc + if model_name.lower().endswith("-BF16"): + load_in_4bit = False + 
load_in_8bit = False + load_in_16bit = True # Check modelscope if USE_MODELSCOPE and not os.path.exists(model_name): @@ -870,10 +883,15 @@ def from_pretrained( model_name = get_model_name(model_name, load_in_4bit) # Check if pre-quantized models are allowed # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 - if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): - model_name = model_name.removesuffix("-unsloth-bnb-4bit") - model_name = model_name.removesuffix("-bnb-4bit") - pass + if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") + model_name = model_name.lower().removesuffix("-bnb-4bit") + # Change -BF16 to all False for 4bit, 8bit etc + if model_name.lower().endswith("-BF16"): + load_in_4bit = False + load_in_8bit = False + load_in_16bit = True + model_config = AutoConfig.from_pretrained( model_name, token = token, From 47c2dd6eeb3f8f044f4759701d6e14d09ce743e7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 02:22:33 -0700 Subject: [PATCH 257/272] Update _utils.py --- unsloth/models/_utils.py | 155 ++++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 68 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 22ce2fd9e5..0f5dc50450 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -905,6 +905,25 @@ def prepare_model_for_kbit_training( pass # ============================================= +import importlib +global USE_MODELSCOPE +USE_MODELSCOPE = os.environ.get("UNSLOTH_USE_MODELSCOPE", "0") == "1" +if USE_MODELSCOPE: + if importlib.util.find_spec("modelscope") is None: + raise ImportError(f'You are using the modelscope hub, please install modelscope by `pip install modelscope -U`') + pass +pass + +import socket +def has_internet(host = "8.8.8.8", port = 53, timeout = 3): + if 
os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1": return False + try: + socket.setdefaulttimeout(timeout) + socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port)) + return True + except socket.error as ex: + return False +pass import psutil def _get_statistics(statistics = None, force_download = True): @@ -912,56 +931,71 @@ def _get_statistics(statistics = None, force_download = True): # We simply download a README.md file from HF - all data is made public. # This is simply so we can check if some envs are broken or not. # You can disable this by commenting the below out - try: - n_cpus = psutil.cpu_count(logical = False) - keynames = "\n" + "\n".join(os.environ.keys()) - if statistics is not None: pass - elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab" - elif "\nCOLAB_" in keynames: statistics = "colabpro" - elif "\nKAGGLE_" in keynames: statistics = "kaggle" - elif "\nRUNPOD_" in keynames: statistics = "runpod" - elif "\nAWS_" in keynames: statistics = "aws" - elif "\nAZURE_" in keynames: statistics = "azure" - # elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" - elif "\nINVOCATION_ID" in keynames: statistics = "lambda" - # else: statistics = "other" - else: - def try_vllm_check(): - vendor_files = ( - "/sys/class/dmi/id/product_version", - "/sys/class/dmi/id/bios_vendor", - "/sys/class/dmi/id/product_name", - "/sys/class/dmi/id/chassis_asset_tag", - "/sys/class/dmi/id/sys_vendor", - ) - from pathlib import Path - for vendor_file in vendor_files: - path = Path(vendor_file) - if path.is_file(): - file_content = path.read_text().lower() - if "amazon" in file_content: return "aws" - elif "microsoft corporation" in file_content: return "azure" - elif "google" in file_content: return "gcp" - return "other" - pass - try: statistics = try_vllm_check() - except: statistics = "other" - pass - if statistics is not None: - from transformers import AutoModelForCausalLM - stats_model = 
AutoModelForCausalLM.from_pretrained( - f"unslothai/{statistics}", - force_download = force_download, + n_cpus = psutil.cpu_count(logical = False) + keynames = "\n" + "\n".join(os.environ.keys()) + if statistics is not None: pass + # Check modelscope for down detection + global USE_MODELSCOPE + USE_MODELSCOPE = os.environ.get("UNSLOTH_USE_MODELSCOPE", "0") == "1" + elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab" + elif "\nCOLAB_" in keynames: statistics = "colabpro" + elif "\nKAGGLE_" in keynames: statistics = "kaggle" + elif "\nRUNPOD_" in keynames: statistics = "runpod" + elif "\nAWS_" in keynames: statistics = "aws" + elif "\nAZURE_" in keynames: statistics = "azure" + # elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" + elif "\nINVOCATION_ID" in keynames: statistics = "lambda" + # else: statistics = "other" + else: + def try_vllm_check(): + vendor_files = ( + "/sys/class/dmi/id/product_version", + "/sys/class/dmi/id/bios_vendor", + "/sys/class/dmi/id/product_name", + "/sys/class/dmi/id/chassis_asset_tag", + "/sys/class/dmi/id/sys_vendor", ) - del stats_model + from pathlib import Path + for vendor_file in vendor_files: + path = Path(vendor_file) + if path.is_file(): + file_content = path.read_text().lower() + if "amazon" in file_content: return "aws" + elif "microsoft corporation" in file_content: return "azure" + elif "google" in file_content: return "gcp" + return "other" pass - except: + try: statistics = try_vllm_check() + except: statistics = "other" + pass + if statistics is not None: + import tempfile + from huggingface_hub import snapshot_download + from unsloth_zoo.rl_environments import execute_with_time_limit + if has_internet(): + @execute_with_time_limit(60) + def stats_check(): + with tempfile.TemporaryDirectory(ignore_cleanup_errors = True) as f: + snapshot_download(statistics, force_download = True, cache_dir = f, local_dir = f) + try: + stats_check() + except TimeoutError: + raise TimeoutError( + 
"Unsloth: HuggingFace seems to be down :( Check https://status.huggingface.co/\n"\ + "As a temporary measure, use modelscope ie:\n"\ + "pip install modelscope\n"\ + "import os; os.environ['UNSLOTH_USE_MODELSCOPE'] = '1'\n"\ + "from unsloth import FastLanguageModel\n"\ + "model = FastLanguageModel.from_pretrained(...)" + ) pass + pass pass def get_statistics(): # We log some basic stats about which environment is being used. + # This is also to check if HuggingFace is down or not! # We simply download a README.md file from HF - all data is made public. # This is simply so we can check if some envs are broken or not. # You can disable this by setting UNSLOTH_DISABLE_STATISTICS @@ -975,24 +1009,17 @@ def get_statistics(): pass _get_statistics(None) _get_statistics("repeat", force_download = False) - try: - vram = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024 - if vram <= 8 : vram = 8 - elif vram <= 16: vram = 16 - elif vram <= 20: vram = 20 - elif vram <= 24: vram = 24 - elif vram <= 40: vram = 40 - elif vram <= 48: vram = 48 - elif vram <= 80: vram = 80 - else: vram = 96 - _get_statistics(f"vram-{vram}") - except: - pass - pass - try: - _get_statistics(f"{DEVICE_COUNT if DEVICE_COUNT <= 8 else 9}") - except: - pass + vram = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024 + if vram <= 8 : vram = 8 + elif vram <= 16: vram = 16 + elif vram <= 20: vram = 20 + elif vram <= 24: vram = 24 + elif vram <= 40: vram = 40 + elif vram <= 48: vram = 48 + elif vram <= 80: vram = 80 + else: vram = 96 + _get_statistics(f"vram-{vram}") + _get_statistics(f"{DEVICE_COUNT if DEVICE_COUNT <= 8 else 9}") if disabled: enable_progress_bars() pass @@ -1592,14 +1619,6 @@ def __str__ (self): return LOGITS_ERROR_STRING except: continue pass -import importlib -USE_MODELSCOPE = os.environ.get("UNSLOTH_USE_MODELSCOPE", "0") == "1" -if USE_MODELSCOPE: - if importlib.util.find_spec("modelscope") is None: - raise ImportError(f'You are using the modelscope 
hub, please install modelscope by `pip install modelscope -U`') - pass -pass - def validate_loftq_config(loftq_config, lora_dropout, bias, init_lora_weights, model): from peft import LoraConfig From ead800eadfc9e02674fd2e89779c12005669006b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 02:55:54 -0700 Subject: [PATCH 258/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1d46c8824b..624054caed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", - "unsloth_zoo>=2025.10.8", + "unsloth_zoo>=2025.10.7", "torchvision", "unsloth[triton]", ] @@ -461,7 +461,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.8", + "unsloth_zoo>=2025.10.7", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", From 2dc242099b1b125100683f483bc23ebd87dc62dc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 02:57:34 -0700 Subject: [PATCH 259/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 624054caed..fb63f89ae6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", - "unsloth_zoo>=2025.10.7", + "unsloth_zoo>=2025.10.6", "torchvision", "unsloth[triton]", ] @@ -461,7 +461,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.7", + "unsloth_zoo>=2025.10.6", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", From eba9bb35cc1b47cb7741e081b07cc0143536dc1c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 
02:58:40 -0700 Subject: [PATCH 260/272] Update _utils.py --- unsloth/models/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 0f5dc50450..4f613f1f2c 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -933,10 +933,11 @@ def _get_statistics(statistics = None, force_download = True): # You can disable this by commenting the below out n_cpus = psutil.cpu_count(logical = False) keynames = "\n" + "\n".join(os.environ.keys()) - if statistics is not None: pass # Check modelscope for down detection global USE_MODELSCOPE USE_MODELSCOPE = os.environ.get("UNSLOTH_USE_MODELSCOPE", "0") == "1" + + if statistics is not None: pass elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab" elif "\nCOLAB_" in keynames: statistics = "colabpro" elif "\nKAGGLE_" in keynames: statistics = "kaggle" From 3c3765cfde9036dc3b919c0afd0dd6e8744cb97c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 02:59:15 -0700 Subject: [PATCH 261/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fb63f89ae6..1d46c8824b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", - "unsloth_zoo>=2025.10.6", + "unsloth_zoo>=2025.10.8", "torchvision", "unsloth[triton]", ] @@ -461,7 +461,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.6", + "unsloth_zoo>=2025.10.8", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", From 367d6dcc05f9e77ce6e0d2b560178d84151b7521 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:02:51 -0700 Subject: [PATCH 262/272] Update _utils.py --- unsloth/models/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 4f613f1f2c..547ccfd428 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -915,6 +915,7 @@ def prepare_model_for_kbit_training( pass import socket +@functools.lru_cache(1) def has_internet(host = "8.8.8.8", port = 53, timeout = 3): if os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1": return False try: @@ -974,7 +975,7 @@ def try_vllm_check(): from huggingface_hub import snapshot_download from unsloth_zoo.rl_environments import execute_with_time_limit if has_internet(): - @execute_with_time_limit(60) + @execute_with_time_limit(120) def stats_check(): with tempfile.TemporaryDirectory(ignore_cleanup_errors = True) as f: snapshot_download(statistics, force_download = True, cache_dir = f, local_dir = f) From ca3f6882cba744fc7ee1704716bb0dd1dda72fbe Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:04:50 -0700 Subject: [PATCH 263/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 547ccfd428..353378bc06 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -978,7 +978,7 @@ def try_vllm_check(): @execute_with_time_limit(120) def stats_check(): with tempfile.TemporaryDirectory(ignore_cleanup_errors = True) as f: - snapshot_download(statistics, force_download = True, cache_dir = f, local_dir = f) + snapshot_download(f"unslothai/{statistics}", force_download = True, cache_dir = f, local_dir = f) try: stats_check() except TimeoutError: From f6dd92b7088a063c0a104fe3a071797ef226b342 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:10:20 -0700 Subject: [PATCH 264/272] Update loader.py --- unsloth/models/loader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 165b7e0551..e17fb58a46 100644 --- a/unsloth/models/loader.py +++ 
b/unsloth/models/loader.py @@ -214,7 +214,7 @@ def from_pretrained( model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") model_name = model_name.lower().removesuffix("-bnb-4bit") # Change -BF16 to all False for 4bit, 8bit etc - if model_name.lower().endswith("-BF16"): + if model_name.lower().endswith("-bf16"): load_in_4bit = False load_in_8bit = False load_in_16bit = True @@ -335,7 +335,7 @@ def from_pretrained( model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") model_name = model_name.lower().removesuffix("-bnb-4bit") # Change -BF16 to all False for 4bit, 8bit etc - if model_name.lower().endswith("-BF16"): + if model_name.lower().endswith("-bf16"): load_in_4bit = False load_in_8bit = False load_in_16bit = True @@ -662,7 +662,7 @@ def from_pretrained( model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") model_name = model_name.lower().removesuffix("-bnb-4bit") # Change -BF16 to all False for 4bit, 8bit etc - if model_name.lower().endswith("-BF16"): + if model_name.lower().endswith("-bf16"): load_in_4bit = False load_in_8bit = False load_in_16bit = True @@ -887,7 +887,7 @@ def from_pretrained( model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") model_name = model_name.lower().removesuffix("-bnb-4bit") # Change -BF16 to all False for 4bit, 8bit etc - if model_name.lower().endswith("-BF16"): + if model_name.lower().endswith("-bf16"): load_in_4bit = False load_in_8bit = False load_in_16bit = True From 1393bd896cc413f949fcf232979d195a9dac7376 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:12:06 -0700 Subject: [PATCH 265/272] Update _utils.py --- unsloth/models/_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 353378bc06..9084437eb4 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -984,11 +984,13 @@ def stats_check(): except TimeoutError: raise TimeoutError( "Unsloth: HuggingFace seems 
to be down :( Check https://status.huggingface.co/\n"\ - "As a temporary measure, use modelscope ie:\n"\ + "As a temporary measure, use modelscope with the same model name ie:\n"\ + "```\n"\ "pip install modelscope\n"\ "import os; os.environ['UNSLOTH_USE_MODELSCOPE'] = '1'\n"\ "from unsloth import FastLanguageModel\n"\ - "model = FastLanguageModel.from_pretrained(...)" + "model = FastLanguageModel.from_pretrained('unsloth/gpt-oss-20b')\n"\ + "```" ) pass pass From 0da81296c7d7b15d2b5bd3b3a079720d768b92ca Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:12:43 -0700 Subject: [PATCH 266/272] Update _utils.py --- unsloth/models/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 9084437eb4..5c959fe443 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -983,7 +983,8 @@ def stats_check(): stats_check() except TimeoutError: raise TimeoutError( - "Unsloth: HuggingFace seems to be down :( Check https://status.huggingface.co/\n"\ + "Unsloth: HuggingFace seems to be down after trying for 120 seconds :(\n"\ + "Check https://status.huggingface.co/ for more details.\n"\ "As a temporary measure, use modelscope with the same model name ie:\n"\ "```\n"\ "pip install modelscope\n"\ From 0a2ce91c75ca2c97fe1874cea70ce337cf2aa175 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:56:13 -0700 Subject: [PATCH 267/272] local_files_only --- unsloth/models/_utils.py | 3 ++- unsloth/models/llama.py | 3 ++- unsloth/models/vision.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5c959fe443..f2d2e9ee70 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -998,7 +998,7 @@ def stats_check(): pass -def get_statistics(): +def get_statistics(local_files_only = False): # We log some basic stats about which environment is being used. 
# This is also to check if HuggingFace is down or not! # We simply download a README.md file from HF - all data is made public. @@ -1006,6 +1006,7 @@ def get_statistics(): # You can disable this by setting UNSLOTH_DISABLE_STATISTICS import os if "UNSLOTH_DISABLE_STATISTICS" in os.environ: return + if local_files_only: return from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled disabled = False if not are_progress_bars_disabled(): diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 535537a3a1..d94699756b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1922,7 +1922,8 @@ def from_pretrained( if old_hf_transfer != "0": os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" model_patcher.pre_patch() - get_statistics() # For debugging - we use a download counter to see if environments are not breaking + # For debugging - we use a download counter to see if environments are not breaking or if HF is down + get_statistics(kwargs.get("local_files_only", False)) if dtype is None: dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index f2bd7c306b..d6322d77f2 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -416,7 +416,8 @@ def from_pretrained( pass if old_hf_transfer != "0": os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" - get_statistics() # For debugging - we use a download counter to see if environments are not breaking + # For debugging - we use a download counter to see if environments are not breaking or if HF is down + get_statistics(kwargs.get("local_files_only", False)) if dtype is None: dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 From c9f5c1aa6f06bbce9ff736253cb7428648b61602 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 04:00:21 -0700 Subject: [PATCH 268/272] Cut Cross Entropy --- unsloth/models/llama.py | 50 
++++++++++++++++++++--------------------- unsloth/save.py | 12 +++++----- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d94699756b..972d603cf9 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1205,7 +1205,7 @@ def _CausalLM_fast_forward( # < 1024 Normal Unsloth uses less VRAM! if DEVICE_TYPE == "hip": # [TODO] AMD GPUs fail on chunked_cross_entropy loss! - # RuntimeError: Triton Error [HIP]: Code: 1, Messsage: invalid argument + # RuntimeError: Triton Error [HIP]: Code: 1, Messsage: invalid argument RETURN_LOGITS = False elif bsz*q_len <= 1024: RETURN_LOGITS = True @@ -1217,36 +1217,36 @@ def _CausalLM_fast_forward( if self.config.model_type == "falcon_h1": hidden_states = hidden_states * self.config.lm_head_multiplier - # loss = fused_linear_cross_entropy( - # hidden_states = hidden_states, - # lm_weight = lm_head, - # labels = labels, - # num_items_in_batch = n_items, - # logit_softcapping = logit_softcapping, - # ) - loss = unsloth_fused_ce_loss( - trainer = None, - hidden_states = hidden_states, - lm_head_weight = lm_head, - lm_head_bias = None, - labels = labels, - mask = None, - n_items = n_items, - scaling = getattr(self, "accelerator_scaler", None), - target_gb = None, - torch_compile = True, - logit_softcapping = logit_softcapping, + loss = fused_linear_cross_entropy( + hidden_states = hidden_states, + lm_weight = lm_head, + labels = labels, + num_items_in_batch = n_items, + logit_softcapping = logit_softcapping, ) + # loss = unsloth_fused_ce_loss( + # trainer = None, + # hidden_states = hidden_states, + # lm_head_weight = lm_head, + # lm_head_bias = None, + # labels = labels, + # mask = None, + # n_items = n_items, + # scaling = getattr(self, "accelerator_scaler", None), + # target_gb = None, + # torch_compile = True, + # logit_softcapping = logit_softcapping, + # ) if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None 
else output output = CausalLMOutputWithPast( - loss=loss, - logits=EMPTY_LOGITS, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, + loss = loss, + logits = EMPTY_LOGITS, + past_key_values= outputs.past_key_values, + hidden_states = outputs.hidden_states, + attentions = outputs.attentions, ) return output pass diff --git a/unsloth/save.py b/unsloth/save.py index 506c8a68f1..a62d63fc86 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -2565,10 +2565,10 @@ def unsloth_save_pretrained_torchao( """ # first merge the lora weights arguments = dict(locals()) - arguments["model"] = self - arguments["tokenizer"] = tokenizer - arguments["push_to_hub"] = False # We save ourselves - arguments["save_method"] = "merged_16bit" # Must be 16bit + arguments["model"] = self + arguments["tokenizer"] = tokenizer + arguments["push_to_hub"] = False # We save ourselves + arguments["save_method"] = "merged_16bit" # Must be 16bit del arguments["self"] del arguments["torchao_config"] @@ -2722,7 +2722,7 @@ def patch_saving_functions(model, vision = False): model.save_pretrained_merged = types.MethodType(unsloth_generic_save_pretrained_merged, model) model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) - model.save_pretrained_torchao = types.MethodType(unsloth_save_pretrained_torchao, model) + model.save_pretrained_torchao = types.MethodType(unsloth_save_pretrained_torchao, model) model.push_to_hub_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model) model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model) pass @@ -2732,7 +2732,7 @@ def patch_saving_functions(model, vision = False): model.save_pretrained_merged = types.MethodType(unsloth_generic_save_pretrained_merged, model) model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) 
model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) - model.save_pretrained_torchao = types.MethodType(unsloth_save_pretrained_torchao, model) + model.save_pretrained_torchao = types.MethodType(unsloth_save_pretrained_torchao, model) pass return model pass From 76135417fe4867a54f8c87365e90c93489d95142 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 04:23:20 -0700 Subject: [PATCH 269/272] Update llama.py --- unsloth/models/llama.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 972d603cf9..d154dfe20a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1217,26 +1217,28 @@ def _CausalLM_fast_forward( if self.config.model_type == "falcon_h1": hidden_states = hidden_states * self.config.lm_head_multiplier - loss = fused_linear_cross_entropy( - hidden_states = hidden_states, - lm_weight = lm_head, - labels = labels, - num_items_in_batch = n_items, - logit_softcapping = logit_softcapping, - ) - # loss = unsloth_fused_ce_loss( - # trainer = None, - # hidden_states = hidden_states, - # lm_head_weight = lm_head, - # lm_head_bias = None, - # labels = labels, - # mask = None, - # n_items = n_items, - # scaling = getattr(self, "accelerator_scaler", None), - # target_gb = None, - # torch_compile = True, - # logit_softcapping = logit_softcapping, + ### DISABLED since T4 breaks + # OutOfResources: out of resource: shared memory, Required: 98304, Hardware limit: 65536. Reducing block sizes or `num_stages` may help. 
+ # loss = fused_linear_cross_entropy( + # hidden_states = hidden_states, + # lm_weight = lm_head, + # labels = labels, + # num_items_in_batch = n_items, + # logit_softcapping = logit_softcapping, # ) + loss = unsloth_fused_ce_loss( + trainer = None, + hidden_states = hidden_states, + lm_head_weight = lm_head, + lm_head_bias = None, + labels = labels, + mask = None, + n_items = n_items, + scaling = getattr(self, "accelerator_scaler", None), + target_gb = None, + torch_compile = True, + logit_softcapping = logit_softcapping, + ) if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output From 09657816afe995710745954e7dd79843f56ff9ad Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 30 Oct 2025 06:33:01 -0700 Subject: [PATCH 270/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 95c6e65317..a564ad1058 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -218,7 +218,7 @@ def unsloth_base_fast_generate( dtype = torch.float16 else: autocaster = torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = dtype) - + print(dtype, autocaster) # Prepare LoRA # state_dict = convert_lora_modules(self, dtype = dtype) From 6bf04ce034c7e6f9843a5bdd453617529d0d4484 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 30 Oct 2025 06:36:53 -0700 Subject: [PATCH 271/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index a564ad1058..87a9adf53a 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -218,7 +218,6 @@ def unsloth_base_fast_generate( dtype = torch.float16 else: autocaster = torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = dtype) - print(dtype, autocaster) # Prepare LoRA # state_dict = convert_lora_modules(self, dtype = dtype) @@ -278,6 +277,7 
@@ def unsloth_base_fast_generate( pass # DO INFERENCE + print(args, kwargs) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From 7ea715eaf5b1ed4211630e33f154bb086136197f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 30 Oct 2025 06:42:20 -0700 Subject: [PATCH 272/272] Update vision.py --- unsloth/models/vision.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 87a9adf53a..65ba9ccf9b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -156,9 +156,9 @@ def unsloth_base_fast_generate( FastBaseModel.for_inference(self) dtype = _get_dtype(dtype_from_config(self.config)) - # Handle float32 cases - if os.environ.get("UNSLOTH_BFLOAT16_MIXED_PRECISION", "0") == "1": - dtype = torch.bfloat16 + # Handle full float32 cases as config.dtype == torch.float32! + do_bfloat16_mixed_precision = os.environ.get("UNSLOTH_BFLOAT16_MIXED_PRECISION", "0") == "1" + if do_bfloat16_mixed_precision: dtype = torch.bfloat16 # Check if VLM is_vlm = any( @@ -254,6 +254,8 @@ def unsloth_base_fast_generate( cache_implementation = "hybrid" else: cache_implementation = "static" + # [TODO] Unsure why static fails + if do_bfloat16_mixed_precision: cache_implementation = None if "generation_config" in kwargs: kwargs["generation_config"].cache_implementation = cache_implementation @@ -277,7 +279,6 @@ def unsloth_base_fast_generate( pass # DO INFERENCE - print(args, kwargs) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) @@ -525,7 +526,7 @@ def from_pretrained( f"To enable bfloat16 training to reduce VRAM usage by 50% albeit with a slightly higher loss, do:\n"\ "use `float32_mixed_precision = False` during FastLanguageModel.from_pretrained" ) - os.environ["UNSLOTH_BFLOAT16_MIXED_PRECISION"] = "1" + os.environ["UNSLOTH_BFLOAT16_MIXED_PRECISION"] = "1" else: print("Unsloth: Float16 full finetuning uses more memory since 
we upcast weights to float32.") else: