From c39f56fce039742693814b7770bde020399251a3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 9 Aug 2025 14:45:43 -0700 Subject: [PATCH 001/272] Fix mamba --- unsloth/models/loader.py | 2 ++ unsloth/models/vision.py | 1 + 2 files changed, 3 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ea746be43d..75561c4775 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -587,6 +587,8 @@ def from_pretrained( if transformers_version < Version("4.53.0"): raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) elif "falcon-h1" in lowered_model_name: + # Falcon must use float32 Triton ie TRITON_F32_DEFAULT = 'ieee' + # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float32;torch.float16;"\ "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16); "\ diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 5524d8f16d..bdf86196d4 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -373,6 +373,7 @@ def from_pretrained( custom_datatype = _custom_datatype # Execute code as well if len(execute_code.strip()) != 0: + print(execute_code) exec(execute_code) else: custom_datatype = None From 4bd35c509f26c4ff3409090175bba7fab4a604a9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 9 Aug 2025 14:50:53 -0700 Subject: [PATCH 002/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 75561c4775..186d302d44 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -591,8 +591,8 @@ def from_pretrained( # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float32;torch.float16;"\ - "if name.endswith(('q_proj', 'k_proj', 'v_proj', 
'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16); "\ - "os.environ['TRITON_F32_DEFAULT'] = 'ieee';" + "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): "\ + "module, os.environ['TRITON_F32_DEFAULT'] = module.to(torch.float16), 'ieee'" elif "gpt-oss" in lowered_model_name: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # CCE fails on Tesla T4 From 1f0a4c32aac3ca721fb50cad39a8dbbf28e4fc1b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 9 Aug 2025 14:51:04 -0700 Subject: [PATCH 003/272] Update vision.py --- unsloth/models/vision.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index bdf86196d4..5524d8f16d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -373,7 +373,6 @@ def from_pretrained( custom_datatype = _custom_datatype # Execute code as well if len(execute_code.strip()) != 0: - print(execute_code) exec(execute_code) else: custom_datatype = None From 3cb97197d56f31c040c8bc17f68bb682aacb1928 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 9 Aug 2025 14:54:35 -0700 Subject: [PATCH 004/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 186d302d44..b8f2432fc0 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -591,8 +591,8 @@ def from_pretrained( # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float32;torch.float16;"\ - "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): "\ - "module, os.environ['TRITON_F32_DEFAULT'] = module.to(torch.float16), 'ieee'" + "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16);"\ + 
"os.environ['TRITON_F32_DEFAULT'] = 'ieee'" elif "gpt-oss" in lowered_model_name: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # CCE fails on Tesla T4 From 1432eac9d0b82ab732e4e4f1f9fbb0fbbb4c63df Mon Sep 17 00:00:00 2001 From: Datta Nimmaturi Date: Wed, 13 Aug 2025 08:16:43 +0530 Subject: [PATCH 005/272] Filter vLLM standby logs (#3131) * filter vLLM standby logs * safeguard standby logger patch * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py --------- Co-authored-by: Daniel Han --- unsloth/models/_utils.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 4426a28266..d904d8674a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -152,6 +152,40 @@ def __init__(self, text): self.text = text def filter(self, x): return not (self.text in x.getMessage()) pass +if os.environ.get('UNSLOTH_ENABLE_LOGGING', '0') != '1': + try: + from vllm.worker.worker import logger as vllm_worker_logger + vllm_worker_logger.addFilter(HideLoggingMessage("Sleep mode freed")) + del vllm_worker_logger + except: + pass + try: + from vllm.v1.worker.gpu_worker import logger as vllm_gpu_worker_logger + vllm_gpu_worker_logger.addFilter(HideLoggingMessage("Sleep mode freed")) + del vllm_gpu_worker_logger + except: + pass + try: + from vllm.executor.executor_base import logger as vllm_executor_logger + vllm_executor_logger.addFilter(HideLoggingMessage("to fall asleep")) + vllm_executor_logger.addFilter(HideLoggingMessage("to wake up")) + del vllm_executor_logger + except: + pass + try: + from vllm.core.block.prefix_caching_block import logger as vllm_prefix_caching_logger + vllm_prefix_caching_logger.addFilter(HideLoggingMessage("reset prefix cache")) + del vllm_prefix_caching_logger + except: + pass + try: + from vllm.v1.core.block_pool import logger as vllm_block_pool_logger + 
vllm_block_pool_logger.addFilter(HideLoggingMessage("reset prefix cache")) + del vllm_block_pool_logger + except: + pass +pass + # The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here. from transformers.training_args import logger as transformers_training_args_logger transformers_training_args_logger.addFilter(HideLoggingMessage("The speedups")) From fd1124ab64c96af40dbdf8294a9e2bdaa55e01cf Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 12 Aug 2025 21:26:39 -0700 Subject: [PATCH 006/272] Update loader.py --- unsloth/models/loader.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index b8f2432fc0..15f3e43aef 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -111,6 +111,14 @@ def from_pretrained( disable_log_stats = True, *args, **kwargs, ): + # Login to allow private models + if token is None: token = get_token() + if token is not None: + try: + from huggingface_hub import login + login(token = token) + except: + pass if load_in_8bit or full_finetuning: return FastModel.from_pretrained( model_name = model_name, @@ -513,6 +521,13 @@ def from_pretrained( *args, **kwargs, ): if token is None: token = get_token() + # Login to allow private models + if token is not None: + try: + from huggingface_hub import login + login(token = token) + except: + pass if whisper_language is not None: assert(type(whisper_language) is str) if whisper_task is not None: assert(type(whisper_task) is str) SUPPORTS_BFLOAT16 = is_bfloat16_supported() From b78189b2d5a127b43a10f5aed1359a1cfe3629c5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 13 Aug 2025 03:27:54 -0700 Subject: [PATCH 007/272] Add scaler --- unsloth/models/_utils.py | 12 ++++++++++++ unsloth/models/rl.py | 14 ++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d904d8674a..3bd3c2c294 100644 --- 
a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -152,6 +152,7 @@ def __init__(self, text): self.text = text def filter(self, x): return not (self.text in x.getMessage()) pass +# Stop vLLM messages if os.environ.get('UNSLOTH_ENABLE_LOGGING', '0') != '1': try: from vllm.worker.worker import logger as vllm_worker_logger @@ -258,6 +259,17 @@ def filter(self, x): return not (self.text in x.getMessage()) except: pass +# You passed `quantization_config` or equivalent parameters +try: + warnings.filterwarnings( + action = "ignore", + message = r".*quantization_config.*", + category = UserWarning, + append = True, + ) +except: + pass + # Errors out on # Some weights of Gemma3nForConditionalGeneration were not initialized from the model checkpoint from transformers.modeling_utils import logger as transformers_logger diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index deb779588c..e751ef5e30 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -421,6 +421,20 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): RLTrainer_post += neftune_check pass + # Add accelerator scaler to model + if "model" in call_args: + neftune_check = \ + "if hasattr(self, 'accelerator'):\n"\ + " scaler = self.accelerator.scaler\n"\ + " current_model = model\n"\ + " while hasattr(current_model, 'model'):\n"\ + " current_model.accelerator_scaler = scaler\n"\ + " current_model = current_model.model\n"\ + " current_model.accelerator_scaler = scaler\n"\ + "pass\n" + RLTrainer_post += neftune_check + pass + # Edit optional metrics other_metrics_processor = "" if trainer_file in RL_METRICS_CHANGES: From cd2e284c97bb60618da78fcf1314f3a3a5885dd8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 13 Aug 2025 05:12:35 -0700 Subject: [PATCH 008/272] Update llama.py --- unsloth/models/llama.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 3c0d5012ae..eafbd5a433 100644 
--- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1197,12 +1197,25 @@ def _CausalLM_fast_forward( if self.config.model_type == "falcon_h1": hidden_states = hidden_states * self.config.lm_head_multiplier - loss = fused_linear_cross_entropy( - hidden_states = hidden_states, - lm_weight = lm_head, - labels = labels, - num_items_in_batch = n_items, - logit_softcapping = logit_softcapping, + # loss = fused_linear_cross_entropy( + # hidden_states = hidden_states, + # lm_weight = lm_head, + # labels = labels, + # num_items_in_batch = n_items, + # logit_softcapping = logit_softcapping, + # ) + loss = unsloth_fused_ce_loss( + trainer = None, + hidden_states = hidden_states, + lm_head_weight = lm_head, + lm_head_bias = None, + labels = labels, + mask = None, + n_items = n_items, + scaling = getattr(self, "accelerator_scaler", None), + target_gb = 1, + torch_compile = True, + logit_softcapping = logit_softcapping, ) if not return_dict: output = (logits,) + outputs[1:] From 5e976a5881296f35c6affae56178d3a2abc1fb50 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 13 Aug 2025 05:18:55 -0700 Subject: [PATCH 009/272] Update _utils.py --- unsloth/models/_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3bd3c2c294..d6eb82f01c 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -58,6 +58,7 @@ "HAS_CUT_CROSS_ENTROPY", "EMPTY_LOGITS", "fused_linear_cross_entropy", + "unsloth_fused_ce_loss", "patch_unsloth_smart_gradient_checkpointing", "unpatch_unsloth_smart_gradient_checkpointing", @@ -109,6 +110,7 @@ HAS_CUT_CROSS_ENTROPY, fused_linear_cross_entropy, _unsloth_get_batch_samples, + unsloth_fused_ce_loss, ) from unsloth_zoo.vision_utils import ( process_vision_info, From f451adff6be85230da2cd50bf068f23726d9b99d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 13 Aug 2025 06:04:40 -0700 Subject: [PATCH 010/272] Versioning --- pyproject.toml | 6 +++--- unsloth/models/_utils.py | 
2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8e18688ddf..e563ba6fc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "unsloth" dynamic = ["version"] description = "2-5X faster LLM finetuning" readme = "README.md" -requires-python = ">=3.9,<3.13" +requires-python = ">=3.9,<=3.13" license = {text = "Apache-2.0"} keywords = ["ai", "llm",] authors = [ @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.8.3", + "unsloth_zoo>=2025.8.4", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", @@ -384,7 +384,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.8.3", + "unsloth_zoo>=2025.8.4", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d6eb82f01c..d1df57ad5c 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.8.4" +__version__ = "2025.8.5" __all__ = [ "SUPPORTS_BFLOAT16", From 3b82c4259cd7506b351bf9b073a3033be22da8aa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 03:31:47 -0700 Subject: [PATCH 011/272] GPT OSS fix --- unsloth/models/loader.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 7ac27158a2..960f9cc23f 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -615,12 +615,18 @@ def from_pretrained( os.environ["UNSLOTH_ENABLE_CCE"] = "0" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB + # Also set down projection compute dtype to be float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "x = 'gate_up_proj_bias'\n"\ - "if hasattr(module, x): setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ + "if hasattr(module, x): "\ + "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ "x = 'down_proj_bias'\n"\ - "if hasattr(module, x): setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n;" + "if hasattr(module, x): "\ + "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ + ""\ + "if 'down_projs' in name and hasattr(module, 'compute_dtype'): module.compute_dtype = torch.float32\n"\ + ";" else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: if check_model_name in lowered_model_name: From 61366efc914563179c460c16e2e8e144fd4cb4d8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 03:50:52 -0700 
Subject: [PATCH 012/272] GPT OSS fix --- unsloth/models/_utils.py | 2 ++ unsloth/models/loader.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index d1df57ad5c..ab2694fde1 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -68,6 +68,7 @@ "patch_fast_lora", "validate_loftq_config", "RaiseUninitialized", + "dequantize_module_weight", ] import torch @@ -724,6 +725,7 @@ def prepare_model_for_kbit_training( # Weirdly LoraLayer.update_layer downcasts PEFT layers to float16?? # For mixed precision, we need it to be in float32 not float16. from peft import __version__ as peft_version +from peft.utils.integrations import dequantize_module_weight if Version(peft_version) < Version("0.12.0"): from peft.tuners.lora.layer import LoraLayer try: diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 960f9cc23f..bb102376d4 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -625,7 +625,9 @@ def from_pretrained( "if hasattr(module, x): "\ "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ ""\ - "if 'down_projs' in name and hasattr(module, 'compute_dtype'): module.compute_dtype = torch.float32\n"\ + "if 'down_projs' in name and hasattr(module, 'compute_dtype') and "\ + "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "module.compute_dtype = torch.float32\n"\ ";" else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: From de043d95684df41bf69ec8ea3c29538a9bcab1e4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 04:28:57 -0700 Subject: [PATCH 013/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index bb102376d4..c61aab750d 100644 --- a/unsloth/models/loader.py +++ 
b/unsloth/models/loader.py @@ -627,7 +627,7 @@ def from_pretrained( ""\ "if 'down_projs' in name and hasattr(module, 'compute_dtype') and "\ "torch.amax(dequantize_module_weight(module)) >= 1024:"\ - "module.compute_dtype = torch.float32\n"\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: From c1ef6f1a6270e24b47259856e4b229f44cbe4053 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 04:36:16 -0700 Subject: [PATCH 014/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 5524d8f16d..0f267104f3 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -458,6 +458,7 @@ def from_pretrained( # Edit data-types if custom_datatype is not None: for jj, (name, module) in enumerate(model.named_modules()): + print(custom_datatype) exec(custom_datatype) pass pass From f18cd268bae43f9c531bc78a0ded608339b9b056 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 04:41:27 -0700 Subject: [PATCH 015/272] Update vision.py --- unsloth/models/vision.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 0f267104f3..fcba556e7a 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -356,6 +356,7 @@ def from_pretrained( correct_dtype = None if os.environ.get("UNSLOTH_FORCE_CUSTOM_DTYPE", "") != "": custom_datatype = os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] + print(custom_datatype) assert custom_datatype.count(";") >= 4 checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code = custom_datatype.split(";", 4) @@ -371,6 +372,7 @@ def from_pretrained( bnb_compute_dtype = eval(_bnb_compute_dtype) correct_dtype = bnb_compute_dtype custom_datatype = _custom_datatype + print(custom_datatype) # Execute code as well if len(execute_code.strip()) != 0: exec(execute_code) @@ -458,7 +460,6 @@ 
def from_pretrained( # Edit data-types if custom_datatype is not None: for jj, (name, module) in enumerate(model.named_modules()): - print(custom_datatype) exec(custom_datatype) pass pass From 02152243313ae76b42e4b887d7d5c1c87b0901a6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 04:44:56 -0700 Subject: [PATCH 016/272] Update loader.py --- unsloth/models/loader.py | 9 +++++---- unsloth/models/vision.py | 2 -- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index c61aab750d..d0b7d4dc4c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -615,16 +615,17 @@ def from_pretrained( os.environ["UNSLOTH_ENABLE_CCE"] = "0" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB - # Also set down projection compute dtype to be float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "x = 'gate_up_proj_bias'\n"\ "if hasattr(module, x): "\ "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ "x = 'down_proj_bias'\n"\ - "if hasattr(module, x): "\ - "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ - ""\ + ";" + else: + # Set down projection compute dtype to be float32 for float16 machines + os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ + "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'compute_dtype') and "\ "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index fcba556e7a..5524d8f16d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -356,7 +356,6 @@ def from_pretrained( correct_dtype = None if os.environ.get("UNSLOTH_FORCE_CUSTOM_DTYPE", "") != "": 
custom_datatype = os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] - print(custom_datatype) assert custom_datatype.count(";") >= 4 checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code = custom_datatype.split(";", 4) @@ -372,7 +371,6 @@ def from_pretrained( bnb_compute_dtype = eval(_bnb_compute_dtype) correct_dtype = bnb_compute_dtype custom_datatype = _custom_datatype - print(custom_datatype) # Execute code as well if len(execute_code.strip()) != 0: exec(execute_code) From 5ed4a46e7c37e81e9db29f205ad811b061c330c1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 17:23:46 -0700 Subject: [PATCH 017/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 5524d8f16d..bfd0011f89 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -386,6 +386,7 @@ def from_pretrained( print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to eager!") del kwargs["attn_implementation"] pass + print(supports_sdpa, kwargs) bnb_config = None if full_finetuning and (load_in_4bit or load_in_8bit): From a22255811467e34ddac87e9af9879e141bb35673 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 14 Aug 2025 19:22:16 -0700 Subject: [PATCH 018/272] Update vision.py --- unsloth/models/vision.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index bfd0011f89..5524d8f16d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -386,7 +386,6 @@ def from_pretrained( print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to eager!") del kwargs["attn_implementation"] pass - print(supports_sdpa, kwargs) bnb_config = None if full_finetuning and (load_in_4bit or load_in_8bit): From 6cffb1cb06a7b2b5d14a3d36acc5970f1bd790a5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 04:25:15 -0700 Subject: [PATCH 019/272] Update llama.py --- 
unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ab7f4bfdde..ae03a685eb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -701,8 +701,9 @@ def LlamaModel_fast_forward( # Fix out of bounds tokenization if hasattr(self, "max_seq_length"): if seq_length > self.max_seq_length: + shape = input_ids.shape if input_ids is not None else inputs_embeds.shape logger.warning_once( - f"Unsloth: Input IDs of length {seq_length} > the model's max sequence length of {self.max_seq_length}.\n"\ + f"Unsloth: Input IDs of shape {shape} with length {seq_length} > the model's max sequence length of {self.max_seq_length}.\n"\ "We shall truncate it ourselves. It's imperative if you correct this issue first." ) if input_ids is not None: From 15d33a5f0a3fed1e8fbd89acf25dda33ceefc436 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 04:34:50 -0700 Subject: [PATCH 020/272] Update llama.py --- unsloth/models/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ae03a685eb..badcd51a12 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -698,6 +698,9 @@ def LlamaModel_fast_forward( seq_length_with_past = seq_length + shape = input_ids.shape if input_ids is not None else inputs_embeds.shape + print(shape) + # Fix out of bounds tokenization if hasattr(self, "max_seq_length"): if seq_length > self.max_seq_length: From 95a4dafadb9c1a3b65b4b0c0643741a4b6e144eb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 04:54:45 -0700 Subject: [PATCH 021/272] Update llama.py --- unsloth/models/llama.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index badcd51a12..ae03a685eb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -698,9 +698,6 @@ def LlamaModel_fast_forward( seq_length_with_past = seq_length - shape 
= input_ids.shape if input_ids is not None else inputs_embeds.shape - print(shape) - # Fix out of bounds tokenization if hasattr(self, "max_seq_length"): if seq_length > self.max_seq_length: From 4104bba896a760833061ece7dbbdff7423b5d141 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 04:55:34 -0700 Subject: [PATCH 022/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e563ba6fc5..6f6f225bde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.8.4", + "unsloth_zoo>=2025.8.5", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", @@ -384,7 +384,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.8.4", + "unsloth_zoo>=2025.8.5", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index ab2694fde1..c84fd118e7 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.8.5" +__version__ = "2025.8.6" __all__ = [ "SUPPORTS_BFLOAT16", From 8cc1999edaee313354f76c2c232389ad3bf07f23 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 05:03:06 -0700 Subject: [PATCH 023/272] Update mapper.py --- unsloth/models/mapper.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 829fe29583..e8fc55c2bd 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -941,6 +941,16 @@ "Qwen/Qwen3-4B-Thinking-2507", "unsloth/Qwen3-4B-Thinking-2507-bnb-4bit", ), + "unsloth/gemma-3-270m-it-unsloth-bnb-4bit" : ( + "unsloth/gemma-3-270m-it", + "google/gemma-3-270m-it", + "unsloth/gemma-3-270m-it-bnb-4bit", + ), + "unsloth/gemma-3-270m-unsloth-bnb-4bit" : ( + "unsloth/gemma-3-270m", + "google/gemma-3-270m", + "unsloth/gemma-3-270m-bnb-4bit", + ), } INT_TO_FLOAT_MAPPER = {} From ffda8a743c54fb648e8fef8039dfbd724d2fdce2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 18:39:46 -0700 Subject: [PATCH 024/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index a5de457cef..a629021339 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -451,6 +451,7 @@ def from_pretrained( # attn_implementation = attn_implementation, **kwargs, ) + print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) raise_handler.remove() # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer From cdf2e17aea327a652b034a9a2601fee0ae780fb5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 18:49:30 -0700 Subject: [PATCH 025/272] Update vision.py --- unsloth/models/vision.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index a629021339..fa3bb25e12 100644 --- a/unsloth/models/vision.py +++ 
b/unsloth/models/vision.py @@ -470,6 +470,7 @@ def from_pretrained( if DEVICE_TYPE == "cuda": torch.cuda.empty_cache() elif DEVICE_TYPE == "xpu": torch.xpu.empty_cache() pass + print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) # Counteract saved tokenizers tokenizer_name = model_name if tokenizer_name is None else tokenizer_name @@ -516,6 +517,7 @@ def from_pretrained( ) model, tokenizer = patch_tokenizer(model, tokenizer) model = post_patch_loss_function(model) + print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) # Log Unsloth version for future fastpaths for inference if hasattr(model, "config"): From 941d1aeb8f6fb724ca2ca2bc6793980e0647931c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 18:52:00 -0700 Subject: [PATCH 026/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index fa3bb25e12..4dc9cc4639 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -507,6 +507,7 @@ def from_pretrained( tokenizer.pad_token_id = __tokenizer.pad_token_id pass # Fix other stuff like BnB compute data types + print("do_forced_float32", do_forced_float32) model, tokenizer = patch_model_and_tokenizer( model, tokenizer, From 73fa72cb69866bec70cad78855fef994eb95b916 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 19:13:27 -0700 Subject: [PATCH 027/272] Upcast norms --- unsloth/models/loader.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 59226f0f42..edd909abfe 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -571,8 +571,15 @@ def from_pretrained( elif "qwen2.5" in lowered_model_name and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers 
>= 4.49.0." + LATEST) # Gemma 3 - elif "gemma-3" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): - raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) + elif "gemma-3" in lowered_model_name: + if transformers_version < Version("4.50.0.dev0"): + raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) + # Set norms to float32 since anyways they get upcasted to float32 + os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ + "all;None;None;"\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ + ";" # Cohere elif "c4ai-command-a-03-2025" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0." + NIGHTLY) @@ -582,7 +589,8 @@ def from_pretrained( os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # Sesame fails os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;torch.float32;torch.float16;"\ - "if name.endswith(('_proj', 'fc1', 'fc2', 'codebook', 'head')): module.to(torch.float16);" + "if name.endswith(('_proj', 'fc1', 'fc2', 'codebook', 'head')): module.to(torch.float16)"\ + ";" # Granite 4 elif 'granite-4' in lowered_model_name: # granite-4 rms norms are stored as 16 bit, but we upcast @@ -594,9 +602,12 @@ def from_pretrained( # Gemma 3N elif "gemma-3n" in lowered_model_name: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" + # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float16;torch.float16;"\ - "if name.endswith(('.conv')): module;"\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ + ";"\ "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConvNormAct_forward; patch_Gemma3nConvNormAct_forward()" if transformers_version < Version("4.53.0"): @@ -606,7 +617,8 @@ def from_pretrained( # since Mamba kernels error out on 
using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float32;torch.float16;"\ - "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16);"\ + "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16)"\ + ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" elif "gpt-oss" in lowered_model_name: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" @@ -615,22 +627,31 @@ def from_pretrained( os.environ["UNSLOTH_ENABLE_CCE"] = "0" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB + # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "x = 'gate_up_proj_bias'\n"\ "if hasattr(module, x): "\ "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ + ""\ "x = 'down_proj_bias'\n"\ "if hasattr(module, x): "\ "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ + ""\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" else: # Set down projection compute dtype to be float32 for float16 machines + # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if 'down_projs' in name and hasattr(module, 'compute_dtype') and "\ + "if 'down_projs' in name and "\ "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ + ""\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: From e4bbeef2c9b56635ff20ffbaff865c26a052babc Mon Sep 17 
00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 19:22:19 -0700 Subject: [PATCH 028/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index edd909abfe..86850b0253 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -646,7 +646,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if 'down_projs' in name and "\ + "if 'down_projs' in name and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ From c8d00bebb323700f00742dec14b1319603db7720 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 Aug 2025 19:25:03 -0700 Subject: [PATCH 029/272] Update vision.py --- unsloth/models/vision.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 4dc9cc4639..a5de457cef 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -451,7 +451,6 @@ def from_pretrained( # attn_implementation = attn_implementation, **kwargs, ) - print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) raise_handler.remove() # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer @@ -470,7 +469,6 @@ def from_pretrained( if DEVICE_TYPE == "cuda": torch.cuda.empty_cache() elif DEVICE_TYPE == "xpu": torch.xpu.empty_cache() pass - print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) # Counteract saved tokenizers tokenizer_name = model_name if tokenizer_name is None else tokenizer_name @@ -507,7 +505,6 @@ def from_pretrained( tokenizer.pad_token_id = __tokenizer.pad_token_id pass # Fix other stuff like BnB compute data types - print("do_forced_float32", do_forced_float32) model, 
tokenizer = patch_model_and_tokenizer( model, tokenizer, @@ -518,7 +515,6 @@ def from_pretrained( ) model, tokenizer = patch_tokenizer(model, tokenizer) model = post_patch_loss_function(model) - print(model.model.layers[0].input_layernorm.weight, model.model.layers[0].input_layernorm.weight.dtype) # Log Unsloth version for future fastpaths for inference if hasattr(model, "config"): From 564b6f8cd6f73bd0f064347a0d83ab236783317e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 16 Aug 2025 23:10:15 -0700 Subject: [PATCH 030/272] Upcast layernorms --- unsloth/models/loader.py | 24 +++++++++--------------- unsloth/models/vision.py | 6 ++++++ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 86850b0253..e59aef1fd0 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -575,11 +575,7 @@ def from_pretrained( if transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) # Set norms to float32 since anyways they get upcasted to float32 - os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "all;None;None;"\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ - ";" + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" # Cohere elif "c4ai-command-a-03-2025" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0." 
+ NIGHTLY) @@ -593,25 +589,25 @@ def from_pretrained( ";" # Granite 4 elif 'granite-4' in lowered_model_name: - # granite-4 rms norms are stored as 16 bit, but we upcast - os.environ["UNSLOTH_UPCAST_LAYERNORM"] = "1" + # Granite-4 rms norms are stored as 16 bit, but we upcast + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # Olmo 2 elif "olmo-2" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: OLMo-2 only works on transformers >= 4.50.0." + NIGHTLY) # Gemma 3N elif "gemma-3n" in lowered_model_name: + if transformers_version < Version("4.53.0"): + raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" - # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "float16;torch.float16;torch.float16;"\ "if name.endswith('norm'): "\ "module._pre_set_compute_dtype = torch.float32\n"\ ";"\ "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConvNormAct_forward; patch_Gemma3nConvNormAct_forward()" - - if transformers_version < Version("4.53.0"): - raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) + # Set norms to float32 since anyways they get upcasted to float32 + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" elif "falcon-h1" in lowered_model_name: # Falcon must use float32 Triton ie TRITON_F32_DEFAULT = 'ieee' # since Mamba kernels error out on using lower precision @@ -638,8 +634,6 @@ def from_pretrained( "if hasattr(module, x): "\ "setattr(module, x, torch.nn.Parameter(getattr(module, x).to(torch.float32)) if isinstance(getattr(module, x), torch.nn.Parameter) else getattr(module, x).to(torch.float32))\n"\ ""\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ ";" else: # Set down projection compute dtype to be float32 for float16 
machines @@ -650,9 +644,9 @@ def from_pretrained( "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ ";" + # Set norms to float32 since anyways they get upcasted to float32 + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: if check_model_name in lowered_model_name: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index a5de457cef..6790c5cd12 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -455,6 +455,12 @@ def from_pretrained( # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer + # Check float32 norm weights + if os.environ.get("UNSLOTH_HIGH_PRECISION_LAYERNORM", "0") == "1": + for jj, (name, module) in enumerate(model.named_modules()): + if name.endswith("norm") and hasattr(module, "weight"): + module._pre_set_compute_dtype = torch.float32 + pass # Edit data-types if custom_datatype is not None: with torch.no_grad(): From b8a34b4a5eeeddab69320aed0097a801d7d0b1b8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 Aug 2025 16:45:46 -0700 Subject: [PATCH 031/272] Update llama.py --- unsloth/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ae03a685eb..7217c0b593 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -170,6 +170,7 @@ def needs_device_kw(fn) -> bool: if "cache_position" in kwargs: kwargs["position_ids"] = kwargs["cache_position"] + print(attention_mask) return { "input_ids" : input_ids, "attention_mask": attention_mask, **kwargs, } pass From 509fcb5ea138a7f7d29d033399b0fd0d953499e4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 Aug 2025 16:55:02 -0700 Subject: [PATCH 032/272] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/unsloth/models/llama.py b/unsloth/models/llama.py index 7217c0b593..6beb9943e8 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -170,7 +170,6 @@ def needs_device_kw(fn) -> bool: if "cache_position" in kwargs: kwargs["position_ids"] = kwargs["cache_position"] - print(attention_mask) return { "input_ids" : input_ids, "attention_mask": attention_mask, **kwargs, } pass @@ -798,6 +797,7 @@ def LlamaModel_fast_forward( pass # Ignore attention_mask + print(attention_mask) if attention_mask is None: padding_mask = None elif self.training: From 27f1a2efc64f75eade35e5322b2278bbb1b8812a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 Aug 2025 17:38:42 -0700 Subject: [PATCH 033/272] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6beb9943e8..763d69a5b8 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -797,7 +797,7 @@ def LlamaModel_fast_forward( pass # Ignore attention_mask - print(attention_mask) + print(attention_mask, attention_mask.dtype, attention_mask.shape, attention_mask[:, :, 0]) if attention_mask is None: padding_mask = None elif self.training: From 931851abfdd6fea51c72eee6afdc4809fec14bc3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 Aug 2025 17:51:17 -0700 Subject: [PATCH 034/272] Update llama.py --- unsloth/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 763d69a5b8..7cb39f9c77 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -797,7 +797,7 @@ def LlamaModel_fast_forward( pass # Ignore attention_mask - print(attention_mask, attention_mask.dtype, attention_mask.shape, attention_mask[:, :, 0]) + print(attention_mask, attention_mask.dtype, attention_mask.shape, attention_mask) if attention_mask is None: padding_mask = None elif self.training: From 
3b9057bf81aedafba9c7d30f7e3eca80486bec07 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 Aug 2025 19:16:35 -0700 Subject: [PATCH 035/272] Update llama.py --- unsloth/models/llama.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 7cb39f9c77..4100afc60e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +global final_attention_mask import torch import gc import math @@ -797,7 +797,10 @@ def LlamaModel_fast_forward( pass # Ignore attention_mask - print(attention_mask, attention_mask.dtype, attention_mask.shape, attention_mask) + if "RAISE_ATTENTION_MASK" in os.environ: + global final_attention_mask + final_attention_mask = attention_mask + raise if attention_mask is None: padding_mask = None elif self.training: From 3dd87bb0ccc3886611f7fe60e24ec97393c47342 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 03:10:07 -0700 Subject: [PATCH 036/272] Update llama.py --- unsloth/models/llama.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 4100afc60e..ae03a685eb 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-global final_attention_mask + import torch import gc import math @@ -797,10 +797,6 @@ def LlamaModel_fast_forward( pass # Ignore attention_mask - if "RAISE_ATTENTION_MASK" in os.environ: - global final_attention_mask - final_attention_mask = attention_mask - raise if attention_mask is None: padding_mask = None elif self.training: From b757faf23e7c4cdbc5eee85c39f4841fd9841450 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 05:36:47 -0700 Subject: [PATCH 037/272] Update save.py --- unsloth/save.py | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index e6d09b78fa..ef9c84e925 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1195,6 +1195,41 @@ def save_to_gguf( f"--outfile {final_location} --vocab-type {vocab_type} "\ f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab" else: + # Fix up conversion script is possible + with open(convert_location, "rb") as f: converter_latest = f.read() + # Fix metadata + converter_latest = re.sub( + rb"(self\.metadata \= .+?\(.+?\)"\ + rb"[\n]{1,}([\s]{4,}))", + rb"\1"\ + rb"if hasattr(self.metadata, 'quantized_by'): self.metadata.quantized_by = 'Unsloth'\n"\ + rb"\2if hasattr(self.metadata, 'repo_url'): self.metadata.repo_url = 'https://huggingface.co/unsloth'\n"\ + rb"\2if hasattr(self.metadata, 'tags'): self.metadata.tags = ['unsloth', 'llama.cpp']\n"\ + rb"\2", + converter_latest, + ) + + # Make mistral_common optional for now + # from x import y + converter_latest = re.sub( + rb"(from mistral_common[^\n\(]{1,})[\s]{0,}\n", + rb"try:\n \1\nexcept:\n pass\n", + converter_latest, + ) + # from x import (y, z,) + converter_latest = re.sub( + rb"(from mistral_common[^\n\(]{1,}[\s]{0,}\(.+?\))", + rb"try:\n \1\nexcept:\n pass\n", + converter_latest, + flags = re.MULTILINE | re.DOTALL, + ) + + try: + # Write file + with open(convert_location, "wb") as file: + file.write(converter_latest) + except: + pass 
command = f"python {convert_location} {model_directory} "\ f"--outfile {final_location} "\ f"--outtype {first_conversion}" @@ -1694,7 +1729,7 @@ def push_to_ollama_hub(username: str, model_name: str, tag: str): print(f"\nMODEL PUBLISHED FAILED WITH RETURN CODE {return_code}") else: print("\nMODEL PUBLISHED SUCCESSFULLY") - +pass def push_to_ollama( tokenizer, @@ -1726,9 +1761,7 @@ def push_to_ollama( ) print("Successfully pushed to ollama") - - - +pass def unsloth_save_pretrained_gguf( From 2e86333f332204c613a2e5636b88f0e1ef34487d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 05:42:11 -0700 Subject: [PATCH 038/272] Update rl.py --- unsloth/models/rl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index e751ef5e30..b08d4eda62 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -487,6 +487,8 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): "logging_steps" : 1, "max_seq_length" : None, "num_generations" : 8, + "steps_per_generation" : 1, # Otherwise defaults to ga_steps which is wrong + "generation_batch_size" : None, # Useless. 
If steps_per_generation set, generation_batch_size clashes "top_k" : None, "vllm_mode" : "colocate", "generation_kwargs" : {}, From b01e948b8d351ce1a8ae41de55e8dc7a7648bc32 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 05:44:09 -0700 Subject: [PATCH 039/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6f6f225bde..f8558a83b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.8.5", + "unsloth_zoo>=2025.8.6", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", @@ -384,7 +384,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.8.5", + "unsloth_zoo>=2025.8.6", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0", From a751fd789636a36ba1edd75775946a1339689e00 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 06:07:29 -0700 Subject: [PATCH 040/272] Update rl.py --- unsloth/models/rl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index b08d4eda62..52b1e83694 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -487,8 +487,8 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): "logging_steps" : 1, "max_seq_length" : None, "num_generations" : 8, - "steps_per_generation" : 1, # Otherwise defaults to ga_steps which is wrong - "generation_batch_size" : None, # Useless. If steps_per_generation set, generation_batch_size clashes + # "steps_per_generation" : 1, # Otherwise defaults to ga_steps which is wrong + # "generation_batch_size" : None, # Useless. 
If steps_per_generation set, generation_batch_size clashes "top_k" : None, "vllm_mode" : "colocate", "generation_kwargs" : {}, From 3cb6eaf68bda8bb8bad74bd2087c6f1aa366d80e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 06:24:30 -0700 Subject: [PATCH 041/272] Update rl_replacements.py --- unsloth/models/rl_replacements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py index 2555f0df1f..717e6cbf11 100644 --- a/unsloth/models/rl_replacements.py +++ b/unsloth/models/rl_replacements.py @@ -556,7 +556,7 @@ def grpo_trainer_fix_batch_size(RLTrainer_source, RLConfig_source): " per_device_train_batch_size = num_generations\n" return check_batch_size pass -RL_CONFIG_CHANGES["grpo_trainer"].append(grpo_trainer_fix_batch_size) +# RL_CONFIG_CHANGES["grpo_trainer"].append(grpo_trainer_fix_batch_size) # Add other reward function names From de77a26c00cbc93050e103cf5060e54eac72b15c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 21:02:30 -0700 Subject: [PATCH 042/272] Update rl.py --- unsloth/models/rl.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 52b1e83694..4dabdee639 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -133,15 +133,18 @@ class Unsloth{RLConfig_name}({RLConfig_name}): default = -1, metadata = {{'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}}, ) + {max_seq_length_pre} def __init__({RLConfig_arguments}, vllm_sampling_params = None, unsloth_num_chunks = -1, + {max_seq_length_call} **kwargs, ): {RLConfig_extra_args} super().__init__({RLConfig_call_args}{RLConfig_kwargs}) self.vllm_sampling_params = vllm_sampling_params self.unsloth_num_chunks = unsloth_num_chunks + {max_seq_length_post} pass {RLTrainer_extras} @@ -266,6 +269,21 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): extra_args += mixed_precision pass + # Check if max_seq_length is NOT defined (max_length is now default) + if "max_seq_length" not in call_args and "max_length" in call_args: + max_seq_length_pre = \ + """max_seq_length : Optional[int] = field( + default = None, + metadata = {{'help': 'Maximum sequence length to truncate to.'}}, + )""" + max_seq_length_call = "max_seq_length = max_seq_length," + max_seq_length_post = "self.max_seq_length = max_seq_length" + else: + max_seq_length_pre = "" + max_seq_length_call = "" + max_seq_length_post = "" + pass + # Check if per_device_eval_batch_size (default 8) bigger than bsz # Also use FP16 / BF16 evaluation if "args" in call_args: @@ -353,9 +371,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): " max_length = args.max_length\n"\ " else:\n"\ " model_max_length = getattr(model, 'max_seq_length', None)\n"\ - " # print(model_max_length, 'mml1')\n"\ " if model_max_length is None: model_max_length = getattr(model, 'max_length', None)\n"\ - " # print(model_max_length, 'mml2')\n"\ " if model_max_length is not None:\n"\ " args.max_length = model_max_length\n"\ " max_length = args.max_length\n"\ @@ -666,6 +682,10 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): RLTrainer_post = RLTrainer_post, RL_pre = RL_pre, + max_seq_length_pre = max_seq_length_pre, + max_seq_length_call = max_seq_length_call, + max_seq_length_post = max_seq_length_post, + selective_log_softmax_code = selective_log_softmax_code, ) From 
27ca53180d68e80818e8e40f03e85d6abd897401 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 21:08:45 -0700 Subject: [PATCH 043/272] Update rl.py --- unsloth/models/rl.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 4dabdee639..f21bcbe4db 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -269,21 +269,6 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): extra_args += mixed_precision pass - # Check if max_seq_length is NOT defined (max_length is now default) - if "max_seq_length" not in call_args and "max_length" in call_args: - max_seq_length_pre = \ - """max_seq_length : Optional[int] = field( - default = None, - metadata = {{'help': 'Maximum sequence length to truncate to.'}}, - )""" - max_seq_length_call = "max_seq_length = max_seq_length," - max_seq_length_post = "self.max_seq_length = max_seq_length" - else: - max_seq_length_pre = "" - max_seq_length_call = "" - max_seq_length_post = "" - pass - # Check if per_device_eval_batch_size (default 8) bigger than bsz # Also use FP16 / BF16 evaluation if "args" in call_args: @@ -551,6 +536,21 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): extra_args += learning_rate_check pass + # Check if max_seq_length is NOT defined (max_length is now default) + if "max_seq_length" not in call_args and "max_length" in call_args: + max_seq_length_pre = \ + """max_seq_length : Optional[int] = field( + default = None, + metadata = {{'help': 'Maximum sequence length to truncate to.'}}, + )""" + max_seq_length_call = "max_seq_length = max_seq_length," + max_seq_length_post = "self.max_seq_length = max_seq_length" + else: + max_seq_length_pre = "" + max_seq_length_call = "" + max_seq_length_post = "" + pass + # Add output_dir saving if "output_dir" in call_args: # Default checks From 6514c8ee55baf15360f5bf840dcaf6e8cf9eeb0f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 
2025 21:10:05 -0700 Subject: [PATCH 044/272] Update rl.py --- unsloth/models/rl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index f21bcbe4db..afa6b25731 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -541,7 +541,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): max_seq_length_pre = \ """max_seq_length : Optional[int] = field( default = None, - metadata = {{'help': 'Maximum sequence length to truncate to.'}}, + metadata = {'help': 'Maximum sequence length to truncate to.'}, )""" max_seq_length_call = "max_seq_length = max_seq_length," max_seq_length_post = "self.max_seq_length = max_seq_length" From 3e29ae7ca8fa2ef130a3dedce365d5c33a7d63b7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 Aug 2025 22:41:37 -0700 Subject: [PATCH 045/272] Update _utils.py --- unsloth/models/_utils.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 749becf098..dd1798f105 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -273,6 +273,38 @@ def filter(self, x): return not (self.text in x.getMessage()) except: pass +# Using a slow image processor as `use_fast` +try: + from transformers.processing_utils import logger as processing_utils_logger + processing_utils_logger.addFilter(HideLoggingMessage("`use_fast`")) + del processing_utils_logger +except: + pass + +# Using a slow image processor as `use_fast` +try: + from transformers.models.auto.image_processing_auto import logger as processing_utils_logger + processing_utils_logger.addFilter(HideLoggingMessage("`use_fast`")) + del processing_utils_logger +except: + pass + +# `use_cache=True` is incompatible with gradient checkpointing +try: + from transformers.trainer import logger as trainer_logger + trainer_logger.addFilter(HideLoggingMessage("`use_cache=True`")) + del trainer_logger +except: + pass + +# `use_cache=True` is 
incompatible with gradient checkpointing +try: + from transformers.utils.generic import logger as trainer_logger + trainer_logger.addFilter(HideLoggingMessage("`use_cache=True`")) + del trainer_logger +except: + pass + # Errors out on # Some weights of Gemma3nForConditionalGeneration were not initialized from the model checkpoint from transformers.modeling_utils import logger as transformers_logger From a42f6247d09a42ce858a4ce6af733463c2eb958b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 Aug 2025 02:33:58 -0700 Subject: [PATCH 046/272] Update __init__.py --- unsloth/__init__.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 5d9ddbd43f..1055dfb3eb 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -12,6 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +try: + # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' + # MUST do this at the start primarily due to tensorflow causing issues + import google.protobuf.message_factory + class MessageFactory: + def CreatePrototype(self, *args, **kwargs): return + def GetMessages(self, *args, **kwargs): return + def GetPrototype(self, *args, **kwargs): return + if not hasattr(google.protobuf.message_factory, "MessageFactory"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + not hasattr(google.protobuf.message_factory, "GetMessageClass"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + hasattr(google.protobuf.message_factory, "GetMessageClass"): + GetMessageClass = 
google.protobuf.message_factory.GetMessageClass + def GetPrototype(self, descriptor): + return GetMessageClass(descriptor) + google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + pass +except: + pass + import warnings, importlib, sys from packaging.version import Version import os, re, subprocess, inspect From 9437f9e269d28070c2ee68abd6dce087b0cb78f4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 Aug 2025 03:14:46 -0700 Subject: [PATCH 047/272] Torch 2.8 --- pyproject.toml | 112 ++++++++++++++++++++++++++++++++++++++- unsloth/_auto_install.py | 6 ++- 2 files changed, 116 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f8558a83b6..0462327beb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -207,6 +207,16 @@ cu126onlytorch260 = [ "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'", "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'", ] +cu118onlytorch270 = [ + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp39-cp39-win_amd64.whl ; python_version=='3.9' and platform_system == 'Windows'", + "xformers @ 
https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp310-cp310-win_amd64.whl ; python_version=='3.10' and platform_system == 'Windows'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'", +] cu126onlytorch270 = [ "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'", "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'", @@ -227,6 +237,30 @@ cu128onlytorch270 = [ "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'", "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'", ] +cu118onlytorch271 = [ + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] +cu126onlytorch271 = [ + "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] +cu128onlytorch271 = [ + "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ 
https://download.pytorch.org/whl/cu128/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] +cu118onlytorch280 = [ + "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] +cu126onlytorch280 = [ + "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] +cu128onlytorch280 = [ + "xformers @ https://download.pytorch.org/whl/cu129/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; platform_system == 'Linux'", + "xformers @ https://download.pytorch.org/whl/cu129/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; platform_system == 'Windows'", +] cu118 = [ "unsloth[huggingface]", "bitsandbytes>=0.45.5", @@ -337,6 +371,11 @@ cu126-torch260 = [ "bitsandbytes>=0.45.5", "unsloth[cu126onlytorch260]", ] +cu118-torch270 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch270]", +] cu126-torch270 = [ "unsloth[huggingface]", "bitsandbytes>=0.45.5", @@ -347,6 +386,36 @@ cu128-torch270 = [ "bitsandbytes>=0.45.5", "unsloth[cu128onlytorch270]", ] +cu118-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch271]", +] +cu126-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu126onlytorch271]", +] +cu128-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu128onlytorch271]", +] +cu118-torch280 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch280]", +] +cu126-torch280 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu126onlytorch280]", +] +cu128-torch280 = [ 
+ "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu128onlytorch280]", +] kaggle = [ "unsloth[huggingface]", ] @@ -540,6 +609,12 @@ cu126-ampere-torch260 = [ "unsloth[cu126onlytorch260]", "unsloth[flashattention]", ] +cu118-ampere-torch270 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch270]", + "unsloth[flashattention]", +] cu126-ampere-torch270 = [ "unsloth[huggingface]", "bitsandbytes>=0.45.5", @@ -552,7 +627,42 @@ cu128-ampere-torch270 = [ "unsloth[cu128onlytorch270]", "unsloth[flashattention]", ] - +cu118-ampere-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch271]", + "unsloth[flashattention]", +] +cu126-ampere-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu126onlytorch271]", + "unsloth[flashattention]", +] +cu128-ampere-torch271 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu128onlytorch271]", + "unsloth[flashattention]", +] +cu118-ampere-torch280 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu118onlytorch280]", + "unsloth[flashattention]", +] +cu126-ampere-torch280 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu126onlytorch280]", + "unsloth[flashattention]", +] +cu128-ampere-torch280 = [ + "unsloth[huggingface]", + "bitsandbytes>=0.45.5", + "unsloth[cu128onlytorch280]", + "unsloth[flashattention]", +] flashattentiontorch260abiFALSEcu12x = [ "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp39-cp39-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.9'", "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.10'", diff --git a/unsloth/_auto_install.py b/unsloth/_auto_install.py index c8559394ed..27b23ed476 
100644 --- a/unsloth/_auto_install.py +++ b/unsloth/_auto_install.py @@ -30,7 +30,11 @@ elif v < V('2.5.1'): x = 'cu{}{}-torch250' elif v <= V('2.5.1'): x = 'cu{}{}-torch251' elif v < V('2.7.0'): x = 'cu{}{}-torch260' -elif v < V('2.8.0'): x = 'cu{}{}-torch270' +elif v < V('2.7.9'): x = 'cu{}{}-torch270' +elif v < V('2.8.0'): x = 'cu{}{}-torch271' +elif v < V('2.8.9'): x = 'cu{}{}-torch280' else: raise RuntimeError(f"Torch = {v} too new!") +if v > V('2.6.9') and cuda not in ("11.8", "12.6", "12.8"): + raise RuntimeError(f"CUDA = {cuda} not supported!") x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "") print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"') \ No newline at end of file From 1dd99a2ebc8cf9b19d97ffffcc47bd27582f60cd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 Aug 2025 03:16:34 -0700 Subject: [PATCH 048/272] Update rl_replacements.py --- unsloth/models/rl_replacements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py index 717e6cbf11..2555f0df1f 100644 --- a/unsloth/models/rl_replacements.py +++ b/unsloth/models/rl_replacements.py @@ -556,7 +556,7 @@ def grpo_trainer_fix_batch_size(RLTrainer_source, RLConfig_source): " per_device_train_batch_size = num_generations\n" return check_batch_size pass -# RL_CONFIG_CHANGES["grpo_trainer"].append(grpo_trainer_fix_batch_size) +RL_CONFIG_CHANGES["grpo_trainer"].append(grpo_trainer_fix_batch_size) # Add other reward function names From 5349cd0fa072105ab6904b5339b814eb7ed47b1e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 00:10:48 -0700 Subject: [PATCH 049/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index fae6ae0770..ce09049050 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ 
-641,7 +641,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "torch.amax(dequantize_module_weight(module)) >= 512:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 5a344c2017830ee4a8ee02e81f0383ffd8b2016f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 00:14:49 -0700 Subject: [PATCH 050/272] UNSLOTH_ENABLE_CCE --- unsloth/__init__.py | 6 ++++++ unsloth/models/loader.py | 3 --- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index a43dc4f70f..c6851546b5 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -104,6 +104,12 @@ def get_device_count(): del os.environ["PYTORCH_CUDA_ALLOC_CONF"] pass +# CCE fails on Torch 2.8 and above +# OutOfResources: out of resource: shared memory, Required: 98304, Hardware limit: 65536. Reducing block sizes or `num_stages` +if (major_torch >= 2 and minor_torch >= 8) or (major_torch > 2): + os.environ["UNSLOTH_ENABLE_CCE"] = "0" +pass + # Fix Xformers performance issues since 0.0.25 import importlib.util from pathlib import Path diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ce09049050..94fd81d16d 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -618,9 +618,6 @@ def from_pretrained( "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" elif "gpt-oss" in lowered_model_name: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" - # CCE fails on Tesla T4 - # OutOfResources: out of resource: shared memory, Required: 98304, Hardware limit: 65536. 
Reducing block sizes or `num_stages` - os.environ["UNSLOTH_ENABLE_CCE"] = "0" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB # Set norms to float32 since anyways they get upcasted to float32 From e56363c9dcd8e7e34619261871ccf798872e0fe3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 00:40:23 -0700 Subject: [PATCH 051/272] Fix --- unsloth/__init__.py | 2 +- unsloth/models/loader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index c6851546b5..2c72092b57 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -93,7 +93,7 @@ def get_device_count(): # We support Pytorch 2 # Fixes https://github.com/unslothai/unsloth/issues/38 -torch_version = str(torch.__version__).split(".") +torch_version = str(re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)).split(".") major_torch, minor_torch = torch_version[0], torch_version[1] major_torch, minor_torch = int(major_torch), int(minor_torch) if (major_torch < 2): diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 94fd81d16d..00e942ea93 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 512:"\ + "torch.amax(dequantize_module_weight(module)) >= 128:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From c79aece5377480352b1b9eb5339d175551434745 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:12:42 -0700 Subject: [PATCH 052/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 00e942ea93..050e077a39 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( 
os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 128:"\ + "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From c4b530cc29c08693ce139f4c8decdfb80aed6370 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:32:44 -0700 Subject: [PATCH 053/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 050e077a39..0ff765bf4c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 0:"\ + "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 0913b585eaa4d81df1ab0d2fae09f7944f5178cb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:47:06 -0700 Subject: [PATCH 054/272] Update __init__.py --- unsloth/__init__.py | 51 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 2c72092b57..3cb3c2e492 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -53,6 +53,32 @@ # Log Unsloth is being used os.environ["UNSLOTH_IS_PRESENT"] = "1" +# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' +# MUST do this at the start primarily due to tensorflow causing issues +try: + import google.protobuf.message_factory + class MessageFactory: + def CreatePrototype(self, *args, **kwargs): return + def GetMessages(self, *args, **kwargs): return + def GetPrototype(self, *args, **kwargs): return + if not 
hasattr(google.protobuf.message_factory, "MessageFactory"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + not hasattr(google.protobuf.message_factory, "GetMessageClass"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + hasattr(google.protobuf.message_factory, "GetMessageClass"): + GetMessageClass = google.protobuf.message_factory.GetMessageClass + def GetPrototype(self, descriptor): + return GetMessageClass(descriptor) + google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + pass +except: + pass + +# Try importing PyTorch and check version try: import torch except ModuleNotFoundError: @@ -246,31 +272,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo`") pass -try: - # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' - # MUST do this at the start primarily due to tensorflow causing issues - import google.protobuf.message_factory - class MessageFactory: - def CreatePrototype(self, *args, **kwargs): return - def GetMessages(self, *args, **kwargs): return - def GetPrototype(self, *args, **kwargs): return - if not hasattr(google.protobuf.message_factory, "MessageFactory"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - not hasattr(google.protobuf.message_factory, "GetMessageClass"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") 
and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - hasattr(google.protobuf.message_factory, "GetMessageClass"): - GetMessageClass = google.protobuf.message_factory.GetMessageClass - def GetPrototype(self, descriptor): - return GetMessageClass(descriptor) - google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype - pass -except: - pass - from .models import * from .models import __version__ from .save import * From 374f703ee909c56536265e1cca71306a873abd46 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:49:57 -0700 Subject: [PATCH 055/272] Update __init__.py --- unsloth/__init__.py | 50 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 3cb3c2e492..0430e5704d 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -53,31 +53,6 @@ # Log Unsloth is being used os.environ["UNSLOTH_IS_PRESENT"] = "1" -# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' -# MUST do this at the start primarily due to tensorflow causing issues -try: - import google.protobuf.message_factory - class MessageFactory: - def CreatePrototype(self, *args, **kwargs): return - def GetMessages(self, *args, **kwargs): return - def GetPrototype(self, *args, **kwargs): return - if not hasattr(google.protobuf.message_factory, "MessageFactory"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - not hasattr(google.protobuf.message_factory, "GetMessageClass"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - hasattr(google.protobuf.message_factory, 
"GetMessageClass"): - GetMessageClass = google.protobuf.message_factory.GetMessageClass - def GetPrototype(self, descriptor): - return GetMessageClass(descriptor) - google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype - pass -except: - pass - # Try importing PyTorch and check version try: import torch @@ -136,6 +111,31 @@ def get_device_count(): os.environ["UNSLOTH_ENABLE_CCE"] = "0" pass +# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' +# MUST do this at the start primarily due to tensorflow causing issues +try: + import google.protobuf.message_factory + class MessageFactory: + def CreatePrototype(self, *args, **kwargs): return + def GetMessages(self, *args, **kwargs): return + def GetPrototype(self, *args, **kwargs): return + if not hasattr(google.protobuf.message_factory, "MessageFactory"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + not hasattr(google.protobuf.message_factory, "GetMessageClass"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + hasattr(google.protobuf.message_factory, "GetMessageClass"): + GetMessageClass = google.protobuf.message_factory.GetMessageClass + def GetPrototype(self, descriptor): + return GetMessageClass(descriptor) + google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + pass +except: + pass + # Fix Xformers performance issues since 0.0.25 import importlib.util from pathlib import Path From c0efbec6918a125859e10fa8c412d42e360548be Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:51:18 -0700 Subject: [PATCH 056/272] Update __init__.py --- unsloth/__init__.py | 50 
++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 0430e5704d..f34645651b 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -111,31 +111,6 @@ def get_device_count(): os.environ["UNSLOTH_ENABLE_CCE"] = "0" pass -# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' -# MUST do this at the start primarily due to tensorflow causing issues -try: - import google.protobuf.message_factory - class MessageFactory: - def CreatePrototype(self, *args, **kwargs): return - def GetMessages(self, *args, **kwargs): return - def GetPrototype(self, *args, **kwargs): return - if not hasattr(google.protobuf.message_factory, "MessageFactory"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - not hasattr(google.protobuf.message_factory, "GetMessageClass"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - hasattr(google.protobuf.message_factory, "GetMessageClass"): - GetMessageClass = google.protobuf.message_factory.GetMessageClass - def GetPrototype(self, descriptor): - return GetMessageClass(descriptor) - google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype - pass -except: - pass - # Fix Xformers performance issues since 0.0.25 import importlib.util from pathlib import Path @@ -272,6 +247,31 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo`") pass +# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' +# MUST do this at the start primarily due to tensorflow causing 
issues +try: + import google.protobuf.message_factory + class MessageFactory: + def CreatePrototype(self, *args, **kwargs): return + def GetMessages(self, *args, **kwargs): return + def GetPrototype(self, *args, **kwargs): return + if not hasattr(google.protobuf.message_factory, "MessageFactory"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + not hasattr(google.protobuf.message_factory, "GetMessageClass"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + hasattr(google.protobuf.message_factory, "GetMessageClass"): + GetMessageClass = google.protobuf.message_factory.GetMessageClass + def GetPrototype(self, descriptor): + return GetMessageClass(descriptor) + google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + pass +except: + pass + from .models import * from .models import __version__ from .save import * From 761a4454a95b3ff9a6bc28c2f4ed5619df9b828f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:53:53 -0700 Subject: [PATCH 057/272] Update __init__.py --- unsloth/__init__.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index f34645651b..95035b91b0 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -226,27 +226,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # TODO: check triton for intel installed properly. 
pass -# Check for unsloth_zoo -try: - unsloth_zoo_version = importlib_version("unsloth_zoo") - if Version(unsloth_zoo_version) < Version("2025.8.1"): - print( - "Unsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\n"\ - "Do this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`" - ) - # if os.environ.get("UNSLOTH_DISABLE_AUTO_UPDATES", "0") == "0": - # try: - # os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo") - # except: - # try: - # os.system("pip install --upgrade --no-cache-dir --no-deps --user unsloth_zoo") - # except: - # raise ImportError("Unsloth: Please update unsloth_zoo via `pip install --upgrade --no-cache-dir --no-deps unsloth_zoo`") - import unsloth_zoo -except: - raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo`") -pass - # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' # MUST do this at the start primarily due to tensorflow causing issues try: @@ -272,6 +251,27 @@ def GetPrototype(self, descriptor): except: pass +# Check for unsloth_zoo +try: + unsloth_zoo_version = importlib_version("unsloth_zoo") + if Version(unsloth_zoo_version) < Version("2025.8.1"): + print( + "Unsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\n"\ + "Do this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`" + ) + # if os.environ.get("UNSLOTH_DISABLE_AUTO_UPDATES", "0") == "0": + # try: + # os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo") + # except: + # try: + # os.system("pip install --upgrade --no-cache-dir --no-deps --user unsloth_zoo") + # except: + # raise ImportError("Unsloth: Please update unsloth_zoo via `pip install --upgrade --no-cache-dir --no-deps unsloth_zoo`") + import unsloth_zoo +except: + raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo`") +pass + from .models import * from .models import 
__version__ from .save import * From 30ea44c17f2b4e60b77240c1cb1ec93610c57861 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 01:56:58 -0700 Subject: [PATCH 058/272] Import fixes --- unsloth/__init__.py | 30 ++++-------------------------- unsloth/import_fixes.py | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 26 deletions(-) create mode 100644 unsloth/import_fixes.py diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 95035b91b0..fd6bd7d499 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -17,6 +17,10 @@ import os, re, subprocess, inspect import numpy as np +# Fix some issues before importing other packages +from .import_fixes import fix_message_factory_issue +fix_message_factory_issue(); del fix_message_factory_issue; + # Check if modules that need patching are already imported critical_modules = ['trl', 'transformers', 'peft'] already_imported = [mod for mod in critical_modules if mod in sys.modules] @@ -161,7 +165,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported() pass - # For Gradio HF Spaces? # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ: import triton @@ -226,31 +229,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # TODO: check triton for intel installed properly. 
pass -# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' -# MUST do this at the start primarily due to tensorflow causing issues -try: - import google.protobuf.message_factory - class MessageFactory: - def CreatePrototype(self, *args, **kwargs): return - def GetMessages(self, *args, **kwargs): return - def GetPrototype(self, *args, **kwargs): return - if not hasattr(google.protobuf.message_factory, "MessageFactory"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - not hasattr(google.protobuf.message_factory, "GetMessageClass"): - google.protobuf.message_factory.MessageFactory = MessageFactory - elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ - not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ - hasattr(google.protobuf.message_factory, "GetMessageClass"): - GetMessageClass = google.protobuf.message_factory.GetMessageClass - def GetPrototype(self, descriptor): - return GetMessageClass(descriptor) - google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype - pass -except: - pass - # Check for unsloth_zoo try: unsloth_zoo_version = importlib_version("unsloth_zoo") diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py new file mode 100644 index 0000000000..d265a09df0 --- /dev/null +++ b/unsloth/import_fixes.py @@ -0,0 +1,40 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +def fix_message_factory_issue(): + # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' + # MUST do this at the start primarily due to tensorflow causing issues + try: + import google.protobuf.message_factory + class MessageFactory: + def CreatePrototype(self, *args, **kwargs): return + def GetMessages(self, *args, **kwargs): return + def GetPrototype(self, *args, **kwargs): return + if not hasattr(google.protobuf.message_factory, "MessageFactory"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + not hasattr(google.protobuf.message_factory, "GetMessageClass"): + google.protobuf.message_factory.MessageFactory = MessageFactory + elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ + not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ + hasattr(google.protobuf.message_factory, "GetMessageClass"): + GetMessageClass = google.protobuf.message_factory.GetMessageClass + def GetPrototype(self, descriptor): + return GetMessageClass(descriptor) + google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + pass + except: + pass +pass From c45467cfd91d5d66308f5cbc8a6ab3cc90bec5d5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:00:51 -0700 Subject: [PATCH 059/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/unsloth/models/loader.py b/unsloth/models/loader.py index 0ff765bf4c..72655782f9 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,8 +637,8 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "if hasattr(module, 'weight') and "\ + "torch.amax(dequantize_module_weight(module)) >= 1:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 55e4c78a943a52b9e0b46b29afae0f79e371573c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:15:10 -0700 Subject: [PATCH 060/272] Fix aimv2 issue --- unsloth/__init__.py | 30 +++------------- unsloth/import_fixes.py | 79 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 27 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index fd6bd7d499..335db48775 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -115,35 +115,15 @@ def get_device_count(): os.environ["UNSLOTH_ENABLE_CCE"] = "0" pass -# Fix Xformers performance issues since 0.0.25 +# Fix other issues import importlib.util from pathlib import Path from importlib.metadata import version as importlib_version from packaging.version import Version -try: - xformers_version = importlib_version("xformers") - if Version(xformers_version) < Version("0.0.29"): - xformers_location = importlib.util.find_spec("xformers").origin - xformers_location = os.path.split(xformers_location)[0] - cutlass = Path(xformers_location) / "ops" / "fmha" / "cutlass.py" - - if cutlass.exists(): - with open(cutlass, "r+", encoding = "utf-8") as f: - text = f.read() - # See https://github.com/facebookresearch/xformers/issues/1176#issuecomment-2545829591 - if "num_splits_key=-1," in text: - text = text.replace("num_splits_key=-1,", "num_splits_key=None,") - f.seek(0) - f.write(text) - 
f.truncate() - print("Unsloth: Patching Xformers to fix some performance issues.") - pass - pass - pass - pass -except: - pass -pass +from .import_fixes import fix_xformers_performance_issue +fix_xformers_performance_issue(); del fix_xformers_performance_issue; +from .import_fixes import fix_vllm_aimv2_issue +fix_vllm_aimv2_issue(); del fix_vllm_aimv2_issue; # Torch 2.4 has including_emulation if DEVICE_TYPE == "cuda": diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py index d265a09df0..126aac6365 100644 --- a/unsloth/import_fixes.py +++ b/unsloth/import_fixes.py @@ -12,9 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import importlib.util +from pathlib import Path +from importlib.metadata import version as importlib_version +from packaging.version import Version +UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1" + +# Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' +# MUST do this at the start primarily due to tensorflow causing issues def fix_message_factory_issue(): - # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype' - # MUST do this at the start primarily due to tensorflow causing issues try: import google.protobuf.message_factory class MessageFactory: @@ -22,11 +29,15 @@ def CreatePrototype(self, *args, **kwargs): return def GetMessages(self, *args, **kwargs): return def GetPrototype(self, *args, **kwargs): return if not hasattr(google.protobuf.message_factory, "MessageFactory"): + if UNSLOTH_ENABLE_LOGGING: + print("Unsloth: Patching protobuf.MessageFactory as it doesn't exist") google.protobuf.message_factory.MessageFactory = MessageFactory elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ not hasattr(google.protobuf.message_factory, "GetMessageClass"): 
google.protobuf.message_factory.MessageFactory = MessageFactory + if UNSLOTH_ENABLE_LOGGING: + print("Unsloth: Patching protobuf.MessageFactory as it doesn't exist") elif hasattr(google.protobuf.message_factory, "MessageFactory") and \ not hasattr(google.protobuf.message_factory.MessageFactory, "GetPrototype") and \ hasattr(google.protobuf.message_factory, "GetMessageClass"): @@ -34,7 +45,71 @@ def GetPrototype(self, *args, **kwargs): return def GetPrototype(self, descriptor): return GetMessageClass(descriptor) google.protobuf.message_factory.MessageFactory.GetPrototype = GetPrototype + if UNSLOTH_ENABLE_LOGGING: + print("Unsloth: Patching protobuf.MessageFactory.GetPrototype") pass except: pass pass + +# Fix Xformers performance issues since 0.0.25 +def fix_xformers_performance_issue(): + xformers_version = importlib_version("xformers") + if Version(xformers_version) < Version("0.0.29"): + xformers_location = importlib.util.find_spec("xformers").origin + xformers_location = os.path.split(xformers_location)[0] + cutlass = Path(xformers_location) / "ops" / "fmha" / "cutlass.py" + try: + if cutlass.exists(): + with open(cutlass, "r+", encoding = "utf-8") as f: + text = f.read() + # See https://github.com/facebookresearch/xformers/issues/1176#issuecomment-2545829591 + if "num_splits_key=-1," in text: + text = text.replace( + "num_splits_key=-1,", + "num_splits_key=None,", + ) + f.seek(0) + f.write(text) + f.truncate() + if UNSLOTH_ENABLE_LOGGING: + print("Unsloth: Patching Xformers to fix some performance issues.") + except: + pass +pass + +# ValueError: 'aimv2' is already used by a Transformers config, pick another name. 
+def fix_vllm_aimv2_issue(): + vllm_version = importlib_version("vllm") + if Version(vllm_version) < Version("0.10.1"): + vllm_version = importlib.util.find_spec("xformers").origin + vllm_version = os.path.split(vllm_version)[0] + ovis_config = Path(vllm_version) / "transformers_utils" / "configs" / "ovis.py" + try: + if ovis_config.exists(): + with open(ovis_config, "r+", encoding = "utf-8") as f: + text = f.read() + # See https://github.com/vllm-project/vllm-ascend/issues/2046 + if 'AutoConfig.register("aimv2", AIMv2Config)' in text: + text = text.replace( + 'AutoConfig.register("aimv2", AIMv2Config)', + '', + ) + text = text.replace( + '''backbone_config.pop('model_type') + backbone_config = AutoConfig.for_model(model_type, + **backbone_config)''', + '''if model_type != "aimv2": + backbone_config.pop('model_type') + backbone_config = AutoConfig.for_model(model_type, **backbone_config) + else: + backbone_config = AIMv2Config(**backbone_config)''' + ) + f.seek(0) + f.write(text) + f.truncate() + if UNSLOTH_ENABLE_LOGGING: + print("Unsloth: Patching vLLM to fix `'aimv2' is already used by a Transformers config, pick another name.`") + except: + pass +pass From a160e42ad8250f40b25e72e2a1b2e2d550986a65 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:20:31 -0700 Subject: [PATCH 061/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 72655782f9..0ff765bf4c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,8 +637,8 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1:"\ + "if 'down_projs' in name and hasattr(module, 'weight') and "\ + "torch.amax(dequantize_module_weight(module)) >= 1024:"\ 
"module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 675c4effe78a3ef5bb3f21f6892f3edc54e1e935 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:23:21 -0700 Subject: [PATCH 062/272] Update import_fixes.py --- unsloth/import_fixes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py index 126aac6365..1a4172e01f 100644 --- a/unsloth/import_fixes.py +++ b/unsloth/import_fixes.py @@ -54,6 +54,7 @@ def GetPrototype(self, descriptor): # Fix Xformers performance issues since 0.0.25 def fix_xformers_performance_issue(): + if importlib.util.find_spec("xformers") is None: return xformers_version = importlib_version("xformers") if Version(xformers_version) < Version("0.0.29"): xformers_location = importlib.util.find_spec("xformers").origin @@ -80,6 +81,7 @@ def fix_xformers_performance_issue(): # ValueError: 'aimv2' is already used by a Transformers config, pick another name. def fix_vllm_aimv2_issue(): + if importlib.util.find_spec("vllm") is None: return vllm_version = importlib_version("vllm") if Version(vllm_version) < Version("0.10.1"): vllm_version = importlib.util.find_spec("xformers").origin From a99d6b273c59f0908385559ba2d8b441751b6249 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:26:23 -0700 Subject: [PATCH 063/272] Update import_fixes.py --- unsloth/import_fixes.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py index 1a4172e01f..a07f9970f8 100644 --- a/unsloth/import_fixes.py +++ b/unsloth/import_fixes.py @@ -75,8 +75,9 @@ def fix_xformers_performance_issue(): f.truncate() if UNSLOTH_ENABLE_LOGGING: print("Unsloth: Patching Xformers to fix some performance issues.") - except: - pass + except Exception as e: + if UNSLOTH_ENABLE_LOGGING: + print(f"Unsloth: Failed patching Xformers with error = {str(e)}") pass # ValueError: 'aimv2' is already used by a Transformers config, pick another 
name. @@ -84,7 +85,7 @@ def fix_vllm_aimv2_issue(): if importlib.util.find_spec("vllm") is None: return vllm_version = importlib_version("vllm") if Version(vllm_version) < Version("0.10.1"): - vllm_version = importlib.util.find_spec("xformers").origin + vllm_version = importlib.util.find_spec("vllm").origin vllm_version = os.path.split(vllm_version)[0] ovis_config = Path(vllm_version) / "transformers_utils" / "configs" / "ovis.py" try: @@ -112,6 +113,7 @@ def fix_vllm_aimv2_issue(): f.truncate() if UNSLOTH_ENABLE_LOGGING: print("Unsloth: Patching vLLM to fix `'aimv2' is already used by a Transformers config, pick another name.`") - except: - pass + except Exception as e: + if UNSLOTH_ENABLE_LOGGING: + print(f"Unsloth: Failed patching vLLM with error = {str(e)}") pass From 7e8262303ef06bc39367a17acf0e783abb37c1b4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 02:38:39 -0700 Subject: [PATCH 064/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0ff765bf4c..050e077a39 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if 'down_projs' in name and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 0e678d6fe9ef0aeced0380184bfb9e7c9b1a1778 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 03:38:26 -0700 Subject: [PATCH 065/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 050e077a39..1b110ca513 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to 
float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if 'down_projs' in name and hasattr(module, 'weight') and "\ + "if ('down_projs' in name or 'gate_up_projs' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ From 9b82317a699779d8b96e986fe8ef7a3f16494247 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 04:09:24 -0700 Subject: [PATCH 066/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 1b110ca513..0da6b83d12 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,8 +637,8 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if ('down_projs' in name or 'gate_up_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 0:"\ + "if ('down_projs' in name) and hasattr(module, 'weight') and "\ + "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 8a76fd32bdf05d3e63dd6df309b52d861e11ef3f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 04:39:30 -0700 Subject: [PATCH 067/272] Upgrade --- pyproject.toml | 4 ++-- unsloth/__init__.py | 2 +- unsloth/models/_utils.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c4c3ebe6f5..83b75b0a00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.8.7", + "unsloth_zoo>=2025.8.8", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ 
"flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.8.7", + "unsloth_zoo>=2025.8.8", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 335db48775..a6ea8f4c9f 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -212,7 +212,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # Check for unsloth_zoo try: unsloth_zoo_version = importlib_version("unsloth_zoo") - if Version(unsloth_zoo_version) < Version("2025.8.1"): + if Version(unsloth_zoo_version) < Version("2025.8.8"): print( "Unsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\n"\ "Do this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`" diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 85f1a9a960..fde776a5e6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.8.8" +__version__ = "2025.8.9" __all__ = [ "SUPPORTS_BFLOAT16", From 94bcb28818558f7de378ef4356b5ac6651e545fa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 04:40:24 -0700 Subject: [PATCH 068/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0da6b83d12..54d2fa2ce6 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ ";" From 7d7a1156843603b2b283f77e283801feffbb0ac6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 05:17:37 -0700 Subject: [PATCH 069/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 54d2fa2ce6..878a7c4a4c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;None;None;"\ - "if ('down_projs' in name) and hasattr(module, 'weight') and "\ + "if hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ From 031f5e12487786462fc2f0306ff6792697b2dec7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 07:20:46 -0700 Subject: [PATCH 070/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 878a7c4a4c..3af8200ebb 100644 --- 
a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -636,8 +636,8 @@ def from_pretrained( # Set down projection compute dtype to be float32 for float16 machines # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "all;None;None;"\ - "if hasattr(module, 'weight') and "\ + "torch.float16;torch.bfloat16;torch.bfloat16;"\ + "if ('down_projs' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ From 98bee64be03b6988613e2e3b1dbc5013bff3242b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 07:34:42 -0700 Subject: [PATCH 071/272] Update loader.py --- unsloth/models/loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3af8200ebb..3aed8654f8 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -636,11 +636,13 @@ def from_pretrained( # Set down projection compute dtype to be float32 for float16 machines # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "torch.float16;torch.bfloat16;torch.bfloat16;"\ + "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ ""\ + "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 2ba900880d41c43e5322837d046f00425f3a249c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 17:24:53 -0700 Subject: [PATCH 072/272] Update vision.py --- unsloth/models/vision.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unsloth/models/vision.py 
b/unsloth/models/vision.py index 6790c5cd12..2d3e0a2002 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -245,6 +245,7 @@ def unsloth_base_fast_generate( return output pass +global partial_model class FastBaseModel: @@ -454,6 +455,9 @@ def from_pretrained( raise_handler.remove() # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer + global partial_model + partial_model = model + raise # Check float32 norm weights if os.environ.get("UNSLOTH_HIGH_PRECISION_LAYERNORM", "0") == "1": From ea435e6d06712d59ebe00f8e23c86edacc96173a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 18:17:55 -0700 Subject: [PATCH 073/272] Update vision.py --- unsloth/models/vision.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 2d3e0a2002..a61337b791 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -455,9 +455,6 @@ def from_pretrained( raise_handler.remove() # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer - global partial_model - partial_model = model - raise # Check float32 norm weights if os.environ.get("UNSLOTH_HIGH_PRECISION_LAYERNORM", "0") == "1": @@ -525,6 +522,9 @@ def from_pretrained( ) model, tokenizer = patch_tokenizer(model, tokenizer) model = post_patch_loss_function(model) + global partial_model + partial_model = model + raise # Log Unsloth version for future fastpaths for inference if hasattr(model, "config"): From 5bebfa9f37b933a3b000a5aa3f22448ac8fde7c0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:13:42 -0700 Subject: [PATCH 074/272] custom_datatype --- unsloth/models/loader.py | 2 +- unsloth/models/vision.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3aed8654f8..9ab990133c 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -640,7 +640,7 @@ def 
from_pretrained( "if ('down_projs' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ - ""\ + "\n"\ "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ "module._pre_set_compute_dtype = torch.float32\n"\ ";" diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index a61337b791..c57fd80ef5 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -463,6 +463,7 @@ def from_pretrained( module._pre_set_compute_dtype = torch.float32 pass # Edit data-types + print("custom_datatype", custom_datatype) if custom_datatype is not None: with torch.no_grad(): for jj, (name, module) in enumerate(model.named_modules()): From 356789a65805931f09ffca007227d203f19d1ebc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:18:03 -0700 Subject: [PATCH 075/272] recheck --- unsloth/models/loader.py | 1 + unsloth/models/vision.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 9ab990133c..3de0943917 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -646,6 +646,7 @@ def from_pretrained( ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" + print(os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"]) else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: if check_model_name in lowered_model_name: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index c57fd80ef5..419d760f7a 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -359,7 +359,7 @@ def from_pretrained( custom_datatype = os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] assert custom_datatype.count(";") >= 4 checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code = custom_datatype.split(";", 4) - + print(checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code) # Allow custom 
dtypes on all runs allow_all_runs = (checker == "all") # Allow only on float16 datatypes From d0f97a9a0f295fbe08f3c6b4401b34bcea125ac1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:21:21 -0700 Subject: [PATCH 076/272] Float16 --- unsloth/models/loader.py | 5 ++--- unsloth/models/vision.py | 5 ++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3de0943917..a7d3da17bd 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -601,7 +601,7 @@ def from_pretrained( raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "float16;torch.float16;torch.float16;"\ + "torch.float16;torch.float16;torch.float16;"\ "if name.endswith('norm'): "\ "module._pre_set_compute_dtype = torch.float32\n"\ ";"\ @@ -612,7 +612,7 @@ def from_pretrained( # Falcon must use float32 Triton ie TRITON_F32_DEFAULT = 'ieee' # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "float16;torch.float32;torch.float16;"\ + "torch.float16;torch.float32;torch.float16;"\ "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16)"\ ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" @@ -646,7 +646,6 @@ def from_pretrained( ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" - print(os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"]) else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: if check_model_name in lowered_model_name: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 419d760f7a..12ec00c3bd 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -363,7 +363,10 @@ def from_pretrained( # Allow custom dtypes on all runs allow_all_runs = (checker == 
"all") # Allow only on float16 datatypes - allow_float16_runs = (checker == "float16" and dtype == torch.float16) + allow_float16_runs = ( + (checker == "float16" or checker == "torch.float16") and \ + (dtype == torch.float16) + ) if allow_all_runs or allow_float16_runs: if eval(_dtype) is not None: From d83767f321203359cd31a096b502b6d81181fe77 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:24:26 -0700 Subject: [PATCH 077/272] Update vision.py --- unsloth/models/vision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 12ec00c3bd..705647cb28 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -359,7 +359,6 @@ def from_pretrained( custom_datatype = os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] assert custom_datatype.count(";") >= 4 checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code = custom_datatype.split(";", 4) - print(checker, _dtype, _bnb_compute_dtype, _custom_datatype, execute_code) # Allow custom dtypes on all runs allow_all_runs = (checker == "all") # Allow only on float16 datatypes @@ -367,6 +366,7 @@ def from_pretrained( (checker == "float16" or checker == "torch.float16") and \ (dtype == torch.float16) ) + print([checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) if allow_all_runs or allow_float16_runs: if eval(_dtype) is not None: @@ -387,7 +387,7 @@ def from_pretrained( if not ("attn_implementation" in kwargs): kwargs["attn_implementation"] = "sdpa" if not supports_sdpa: - print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to eager!") + print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to fast eager.") del kwargs["attn_implementation"] pass From 5b575d87ef24302cb434743868836bcd95acc2f2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:27:58 -0700 Subject: [PATCH 078/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 705647cb28..44f62d850d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -366,7 +366,7 @@ def from_pretrained( (checker == "float16" or checker == "torch.float16") and \ (dtype == torch.float16) ) - print([checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) + print([allow_float16_runs], [checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) if allow_all_runs or allow_float16_runs: if eval(_dtype) is not None: From 66eee4deea47e76281497aeabc0be1a215ab9f39 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 19:29:05 -0700 Subject: [PATCH 079/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 44f62d850d..3ce03e6da7 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -366,7 +366,7 @@ def from_pretrained( (checker == "float16" or checker == "torch.float16") and \ (dtype == torch.float16) ) - print([allow_float16_runs], [checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) + print([(checker == "float16" or checker == "torch.float16")], [dtype], [allow_float16_runs], [checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) if allow_all_runs or allow_float16_runs: if eval(_dtype) is not None: From 27d044e47840785f40a195aa7ee77dcab1149046 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 20:38:44 -0700 Subject: [PATCH 080/272] Update vision.py --- unsloth/models/vision.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 3ce03e6da7..e125824c63 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -364,10 +364,8 @@ def from_pretrained( # Allow only on float16 datatypes 
allow_float16_runs = ( (checker == "float16" or checker == "torch.float16") and \ - (dtype == torch.float16) + (dtype == torch.float16 or os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "1") ) - print([(checker == "float16" or checker == "torch.float16")], [dtype], [allow_float16_runs], [checker], [_dtype], [_bnb_compute_dtype], [_custom_datatype], [execute_code] ) - if allow_all_runs or allow_float16_runs: if eval(_dtype) is not None: dtype = eval(_dtype) From 34d07d89463c21cbb33275ccffaf044e3d7df243 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 20:42:24 -0700 Subject: [PATCH 081/272] Update vision.py --- unsloth/models/vision.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e125824c63..23e2bb088a 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -464,7 +464,6 @@ def from_pretrained( module._pre_set_compute_dtype = torch.float32 pass # Edit data-types - print("custom_datatype", custom_datatype) if custom_datatype is not None: with torch.no_grad(): for jj, (name, module) in enumerate(model.named_modules()): @@ -524,9 +523,6 @@ def from_pretrained( ) model, tokenizer = patch_tokenizer(model, tokenizer) model = post_patch_loss_function(model) - global partial_model - partial_model = model - raise # Log Unsloth version for future fastpaths for inference if hasattr(model, "config"): From 3ad756145f638cfaa2f15a21f24d4b97d58d4ad1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 21:29:44 -0700 Subject: [PATCH 082/272] Update loader.py --- unsloth/models/loader.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a7d3da17bd..0156e2f059 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,11 +638,9 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and 
hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 0:"\ + "torch.amax(dequantize_module_weight(module)) >= 1024:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ - "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ - "module._pre_set_compute_dtype = torch.float32\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From b75729795a21149ff23f513469f603f21ddf7a0b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 21:31:05 -0700 Subject: [PATCH 083/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0156e2f059..14baa60d66 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 1024:"\ + "torch.amax(dequantize_module_weight(module)) >= 102400:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" From ceeca866ae8cb9774a830d3fba84c9238c281d77 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 21:44:30 -0700 Subject: [PATCH 084/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 14baa60d66..4e0365ce1e 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 102400:"\ + "torch.amax(dequantize_module_weight(module)) >= 512:"\ "module._pre_set_compute_dtype = 
torch.float32\n"\ "\n"\ ";" From 87758b98edf6cc2aa8addbd19cfba4678fa3cc2c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 20 Aug 2025 21:51:36 -0700 Subject: [PATCH 085/272] Update loader.py --- unsloth/models/loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 4e0365ce1e..85696859ae 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,9 +638,11 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 512:"\ + "torch.amax(dequantize_module_weight(module)) >= 256:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ + "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 97d34d48536b35c0d2fd7d60995c099aea8a6d83 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 00:23:27 -0700 Subject: [PATCH 086/272] Update loader.py --- unsloth/models/loader.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 85696859ae..4e0365ce1e 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,11 +638,9 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 256:"\ + "torch.amax(dequantize_module_weight(module)) >= 512:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ - "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ - "module._pre_set_compute_dtype = torch.float32\n"\ ";" # Set norms to float32 since anyways 
they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 43bf41f9df86e3bb2bf40e4db8957e0418fbc5e6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 00:24:39 -0700 Subject: [PATCH 087/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 4e0365ce1e..94a07bf06a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 512:"\ + "torch.amax(dequantize_module_weight(module)) >= 256:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" From 6e7ad5259d13c959cb08ee81a97547425144d639 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 00:26:49 -0700 Subject: [PATCH 088/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 94a07bf06a..c9c1e05553 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 256:"\ + "torch.amax(dequantize_module_weight(module)) >= 128:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" From d605aa7311bffa8e80ae6ec3e6f34716d209e140 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 00:35:38 -0700 Subject: [PATCH 089/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index c9c1e05553..6ec045eb36 
100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -638,7 +638,7 @@ def from_pretrained( os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs' in name) and hasattr(module, 'weight') and "\ - "torch.amax(dequantize_module_weight(module)) >= 128:"\ + "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" From f417dc882969acfd9e11a4a3d0ed7b548371aa2e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 00:51:06 -0700 Subject: [PATCH 090/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 6ec045eb36..a7d3da17bd 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -641,6 +641,8 @@ def from_pretrained( "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ + "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ + "module._pre_set_compute_dtype = torch.float32\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 05fe3d1fd7d6f202a4f8b50262d5d00127eb72e2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 01:21:10 -0700 Subject: [PATCH 091/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a7d3da17bd..28bb896760 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ - "if ('down_projs' in name) and hasattr(module, 'weight') and "\ + "if ('down_projs' in name or 'gate_up_proj' in name) and hasattr(module, 'weight') and "\ 
"torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ From a79d6f6ac880e17b6079b1ba7981b130615a19dc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 01:54:48 -0700 Subject: [PATCH 092/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 28bb896760..a7d3da17bd 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ - "if ('down_projs' in name or 'gate_up_proj' in name) and hasattr(module, 'weight') and "\ + "if ('down_projs' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ From 59702c494078128468015ccd003761e83ca2451a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 02:09:23 -0700 Subject: [PATCH 093/272] Update loader.py --- unsloth/models/loader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a7d3da17bd..b95678a499 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -643,6 +643,10 @@ def from_pretrained( "\n"\ "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ "module._pre_set_compute_dtype = torch.float32\n"\ + "\n"\ + "if ('self_attn' in name) and hasattr(module, 'sinks'):"\ + "module.sinks._pre_set_compute_dtype = torch.float32\n"\ + "\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 1b66aee7b2f395ba51e1a2e69219f2c08701a95c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 02:32:41 -0700 Subject: [PATCH 094/272] Update loader.py --- 
unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index b95678a499..ef39e636c2 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ - "if ('down_projs' in name) and hasattr(module, 'weight') and "\ + "if ('down_projs' in name or '_proj' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ From a71fa05c7a7a8e72547a7c054e659ce1149e088e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 02:51:45 -0700 Subject: [PATCH 095/272] Update loader.py --- unsloth/models/loader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ef39e636c2..dd0a3961e7 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ - "if ('down_projs' in name or '_proj' in name) and hasattr(module, 'weight') and "\ + "if ('down_projs') and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ @@ -647,6 +647,9 @@ def from_pretrained( "if ('self_attn' in name) and hasattr(module, 'sinks'):"\ "module.sinks._pre_set_compute_dtype = torch.float32\n"\ "\n"\ + "if ('embed_tokens' in name):"\ + "module.sinks._pre_set_compute_dtype = torch.float32\n"\ + "\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" From 
d3e8625b1de6703165535f985d54ebf621eec1ae Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 02:53:29 -0700 Subject: [PATCH 096/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index dd0a3961e7..1c64ae4cfc 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -647,8 +647,8 @@ def from_pretrained( "if ('self_attn' in name) and hasattr(module, 'sinks'):"\ "module.sinks._pre_set_compute_dtype = torch.float32\n"\ "\n"\ - "if ('embed_tokens' in name):"\ - "module.sinks._pre_set_compute_dtype = torch.float32\n"\ + "if ('embed_tokens' in name) and hasattr(module, 'weight'):"\ + "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 From fb112cf3c6b48df1afcf51827f775ce1fee951eb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 03:09:03 -0700 Subject: [PATCH 097/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 1c64ae4cfc..e8c410ebd1 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -647,7 +647,7 @@ def from_pretrained( "if ('self_attn' in name) and hasattr(module, 'sinks'):"\ "module.sinks._pre_set_compute_dtype = torch.float32\n"\ "\n"\ - "if ('embed_tokens' in name) and hasattr(module, 'weight'):"\ + "if ('embed_tokens' in name or 'lm_head' in name) and hasattr(module, 'weight'):"\ "module._pre_set_compute_dtype = torch.float32\n"\ "\n"\ ";" From 5dbdcc565dd6dc8fa5edc2bf4314ad326ffef18c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 03:29:27 -0700 Subject: [PATCH 098/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index e8c410ebd1..c9e0646af7 100644 --- 
a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -648,7 +648,7 @@ def from_pretrained( "module.sinks._pre_set_compute_dtype = torch.float32\n"\ "\n"\ "if ('embed_tokens' in name or 'lm_head' in name) and hasattr(module, 'weight'):"\ - "module._pre_set_compute_dtype = torch.float32\n"\ + "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ ";" # Set norms to float32 since anyways they get upcasted to float32 From fdaa0074093bfffd626632bf8153d52eb7c30a4e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 04:02:33 -0700 Subject: [PATCH 099/272] Update loader.py --- unsloth/models/loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index c9e0646af7..71459599a5 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -639,13 +639,13 @@ def from_pretrained( "torch.float16;torch.bfloat16;torch.float16;"\ "if ('down_projs') and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ - "module._pre_set_compute_dtype = torch.float32\n"\ + "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ - "module._pre_set_compute_dtype = torch.float32\n"\ + "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ "if ('self_attn' in name) and hasattr(module, 'sinks'):"\ - "module.sinks._pre_set_compute_dtype = torch.float32\n"\ + "module.sinks._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ "if ('embed_tokens' in name or 'lm_head' in name) and hasattr(module, 'weight'):"\ "module._pre_set_compute_dtype = torch.bfloat16\n"\ From ba0eb04d9076811da446e8a7d46717ac91fd2ada Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 04:19:00 -0700 Subject: [PATCH 100/272] Bug fix --- unsloth/models/loader.py | 2 +- unsloth/models/vision.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 
71459599a5..7b8320c65a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -637,7 +637,7 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "torch.float16;torch.bfloat16;torch.float16;"\ - "if ('down_projs') and hasattr(module, 'weight') and "\ + "if ('_proj' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 23e2bb088a..486a049339 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -213,7 +213,8 @@ def unsloth_base_fast_generate( cache_implementation = None if cache_implementation is not None: swa = getattr(getattr(self.config, "text_config", self.config), "sliding_window", None) - if swa == 0 or type(swa) is not int: + if (swa == 0 or type(swa) is not int) \ + and (getattr(self, "_can_compile_fullgraph", True) is True): cache_implementation = "static" else: cache_implementation = "hybrid" From 3f982620a575c0117aafc572c4767d77ced7304b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 05:47:58 -0700 Subject: [PATCH 101/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 7b8320c65a..f6bb23551d 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -636,7 +636,7 @@ def from_pretrained( # Set down projection compute dtype to be float32 for float16 machines # Set norms to float32 since anyways they get upcasted to float32 os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "torch.float16;torch.bfloat16;torch.float16;"\ + "torch.float16;torch.bfloat16;torch.bfloat16;"\ "if ('_proj' in name) and hasattr(module, 'weight') and "\ "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = 
torch.bfloat16\n"\ From 3e6511b84f297289bf694893b023db35fd24fc49 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 06:37:40 -0700 Subject: [PATCH 102/272] Update loader.py --- unsloth/models/loader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index f6bb23551d..889d170a17 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -785,7 +785,8 @@ def from_pretrained( model_types = ["siglip"] + model_types # Set forced float32 env flag - os.environ["UNSLOTH_FORCE_FLOAT32"] = "0" + if "UNSLOTH_FORCE_FLOAT32" not in os.environ: + os.environ["UNSLOTH_FORCE_FLOAT32"] = "0" do_forced_float32 = False for model_type_arch in model_types: if model_type_arch != "siglip": break From c9e75375b31d14c66e9f8846e2793f96e9bfee71 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 21 Aug 2025 07:00:44 -0700 Subject: [PATCH 103/272] Update loader.py --- unsloth/models/loader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 889d170a17..3112f674fe 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -641,6 +641,9 @@ def from_pretrained( "torch.amax(dequantize_module_weight(module)) >= 0:"\ "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ + "if hasattr(module, 'weight'):"\ + "module._pre_set_compute_dtype = torch.bfloat16\n"\ + "\n"\ "if ('mlp.router' in name) and hasattr(module, 'weight'):"\ "module._pre_set_compute_dtype = torch.bfloat16\n"\ "\n"\ From 2e38e8a9b9e46b5bb4bf026dfff677728d662297 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 22 Aug 2025 03:42:08 -0700 Subject: [PATCH 104/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3112f674fe..9ae1448762 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -897,6 +897,8 @@ def from_pretrained( if 
load_in_4bit: # Fix up bitsandbytes config + print("torch_dtype", model.config.to_dict().get("torch_dtype")) + print("dtype", model.config.to_dict().get("dtype")) quantization_config = \ { # Sometimes torch_dtype is not a string!! From 8b3a8bacf4a19133d9d4952fad7fd65d437861a8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 22 Aug 2025 03:44:29 -0700 Subject: [PATCH 105/272] Update loader.py --- unsloth/models/loader.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 9ae1448762..1b3b2d6011 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -436,10 +436,12 @@ def from_pretrained( if load_in_4bit: # Fix up bitsandbytes config + config = model.config.to_dict() + torch_dtype = config.get("dtype") or config.get("torch_dtype") quantization_config = \ { # Sometimes torch_dtype is not a string!! - "bnb_4bit_compute_dtype" : model.config.to_dict()["torch_dtype"], + "bnb_4bit_compute_dtype" : torch_dtype, "bnb_4bit_quant_type" : "nf4", "bnb_4bit_use_double_quant" : True, "llm_int8_enable_fp32_cpu_offload" : False, @@ -897,12 +899,12 @@ def from_pretrained( if load_in_4bit: # Fix up bitsandbytes config - print("torch_dtype", model.config.to_dict().get("torch_dtype")) - print("dtype", model.config.to_dict().get("dtype")) + config = model.config.to_dict() + torch_dtype = config.get("dtype") or config.get("torch_dtype") quantization_config = \ { # Sometimes torch_dtype is not a string!! 
- "bnb_4bit_compute_dtype" : model.config.to_dict()["torch_dtype"], + "bnb_4bit_compute_dtype" : torch_dtype, "bnb_4bit_quant_type" : "nf4", "bnb_4bit_use_double_quant" : True, "llm_int8_enable_fp32_cpu_offload" : False, From f706d20e56924bdb26190625ebb66bac4eaa63d6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 22 Aug 2025 03:59:09 -0700 Subject: [PATCH 106/272] torch_dtype --- unsloth/models/vision.py | 19 ++++++++++++++----- unsloth/save.py | 15 +++++++++++---- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 486a049339..fc31032594 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -73,6 +73,9 @@ PROMPT_LOOPKUP = dict() from transformers import GenerationConfig, CompileConfig, HybridCache +from transformers import PretrainedConfig +HAS_TORCH_DTYPE = "torch_dtype" in PretrainedConfig.__doc__ + _compile_config = CompileConfig( fullgraph = False, dynamic = None, @@ -118,7 +121,7 @@ def unsloth_base_fast_generate( bsz = input_ids.shape[0] FastBaseModel.for_inference(self) - dtype = _get_dtype(self.config.torch_dtype) + dtype = _get_dtype(getattr(self.config, "dtype", None) or getattr(self.config, "torch_dtype", None)) # Check if VLM is_vlm = any( @@ -246,8 +249,6 @@ def unsloth_base_fast_generate( return output pass -global partial_model - class FastBaseModel: @staticmethod @@ -443,11 +444,17 @@ def from_pretrained( torch_dtype = dtype if do_forced_float32: torch_dtype = torch.bfloat16 + if HAS_TORCH_DTYPE: + kwargs["torch_dtype"] = torch_dtype + else: + # Transformers removed torch_dtype + kwargs["dtype"] = torch_dtype + raise_handler = RaiseUninitialized() model = auto_model.from_pretrained( model_name, device_map = device_map, - torch_dtype = torch_dtype, + # torch_dtype = torch_dtype, # Transformers removed torch_dtype # quantization_config = bnb_config, token = token, trust_remote_code = trust_remote_code, @@ -698,7 +705,9 @@ def post_patch_model( full_finetuning 
= os.environ.get("UNSLOTH_ENABLE_FULL_FINETUNING", "0") == "1" float32_mixed_precision = True - if _get_dtype(model.config.torch_dtype) == torch.bfloat16 and full_finetuning: + if _get_dtype( + getattr(model.config, "dtype", None) or getattr(model.config, "torch_dtype", None) + ) == torch.bfloat16 and full_finetuning: # Use bfloat16 precision for full finetuning float32_mixed_precision = False diff --git a/unsloth/save.py b/unsloth/save.py index 9539b66701..4535c7dc42 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -549,11 +549,14 @@ def unsloth_save_model( from collections import OrderedDict state_dict = OrderedDict() - torch_dtype = internal_model.config.torch_dtype + torch_dtype = \ + getattr(internal_model.config, "dtype", None) or \ + getattr(internal_model.config, "torch_dtype", None) if type(torch_dtype) is str: if torch_dtype == "float16": torch_dtype = torch.float16 elif torch_dtype == "bfloat16": torch_dtype = torch.bfloat16 - pass + else: + torch_dtype = internal_model.model.embed_tokens.weight.dtype # Check modules to save float32 dtype state_dict["model.embed_tokens.weight"] = internal_model.model.embed_tokens.weight.data.to(torch_dtype) @@ -1880,7 +1883,9 @@ def unsloth_save_pretrained_gguf( for _ in range(3): gc.collect() - model_dtype = self.config.torch_dtype + model_dtype = \ + getattr(self.config, "dtype", None) or \ + getattr(self.config, "torch_dtype", None) model_type = self.config.model_type if type(model_dtype) is str: assert(model_dtype == "float16" or model_dtype == "bfloat16") @@ -2058,7 +2063,9 @@ def unsloth_push_to_hub_gguf( for _ in range(3): gc.collect() - model_dtype = self.config.torch_dtype + model_dtype = \ + getattr(self.config, "dtype", None) or \ + getattr(self.config, "torch_dtype", None) model_type = self.config.model_type if type(model_dtype) is str: assert(model_dtype == "float16" or model_dtype == "bfloat16") From b56cc1b82cfb64a02bbe7a12afd1c05eaa4bf53d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 4 Sep 
2025 03:33:54 -0700 Subject: [PATCH 107/272] Update rl.py --- unsloth/models/rl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 0f1fa2dbf6..b1ab96c840 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -513,7 +513,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): "fp16" : False, "include_tokens_per_second" : False, "include_num_input_tokens_seen" : False, - "auto_find_batch_size" : True, # Auto /2 batch size + "auto_find_batch_size" : False, # Auto /2 batch size - too many people complained so removing "dataloader_pin_memory" : True, # Might fail so disable for now # "dataloader_persistent_workers" : True, # Keeps dataloader in RAM From c47f9367f53c0495bace2aa145252955d620aa78 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 4 Sep 2025 03:55:38 -0700 Subject: [PATCH 108/272] Fix CE Loss --- unsloth/models/llama.py | 4 ++-- unsloth/models/mistral.py | 29 +++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index cf2ca75f75..f978060c9c 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1236,7 +1236,7 @@ def _CausalLM_fast_forward( # < 1024 Normal Unsloth uses less VRAM! 
if bsz*q_len <= 1024: RETURN_LOGITS = True - if not RETURN_LOGITS and HAS_CUT_CROSS_ENTROPY and labels is not None: + if not RETURN_LOGITS and labels is not None: n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None) @@ -1259,7 +1259,7 @@ def _CausalLM_fast_forward( mask = None, n_items = n_items, scaling = getattr(self, "accelerator_scaler", None), - target_gb = 1, + target_gb = None, torch_compile = True, logit_softcapping = logit_softcapping, ) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 6274f2e5df..faab2d30b1 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -300,17 +300,30 @@ def MistralForCausalLM_fast_forward( # < 1024 Normal Unsloth uses less VRAM! if bsz * q_len <= 1024: RETURN_LOGITS = True - if not RETURN_LOGITS and HAS_CUT_CROSS_ENTROPY and os.environ.get("UNSLOTH_ENABLE_CCE", "1") != "0" and labels is not None: + if not RETURN_LOGITS and labels is not None: n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None) logit_softcapping = getattr(self.config, "final_logit_softcapping", 0) - loss = fused_linear_cross_entropy( - hidden_states = hidden_states, - lm_weight = lm_head, - labels = labels, - num_items_in_batch = n_items, - logit_softcapping = logit_softcapping, - ) + # loss = fused_linear_cross_entropy( + # hidden_states = hidden_states, + # lm_weight = lm_head, + # labels = labels, + # num_items_in_batch = n_items, + # logit_softcapping = logit_softcapping, + # ) + loss = unsloth_fused_ce_loss( + trainer = None, + hidden_states = hidden_states, + lm_head_weight = lm_head, + lm_head_bias = None, + labels = labels, + mask = None, + n_items = n_items, + scaling = getattr(self, "accelerator_scaler", None), + target_gb = None, + torch_compile = True, + logit_softcapping = logit_softcapping, + ) if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output From 0b896c5f93e10a24b6db32d96627bb4482ff7558 Mon 
Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 4 Sep 2025 05:11:33 -0700 Subject: [PATCH 109/272] Versioning --- pyproject.toml | 4 ++-- unsloth/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8c60cb5866..160182c2a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.8.9", + "unsloth_zoo>=2025.9.1", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.8.9", + "unsloth_zoo>=2025.9.1", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 1b2a9310ff..25a54165b7 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -214,7 +214,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # Check for unsloth_zoo try: unsloth_zoo_version = importlib_version("unsloth_zoo") - if Version(unsloth_zoo_version) < Version("2025.8.8"): + if Version(unsloth_zoo_version) < Version("2025.9.1"): print( "Unsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\n"\ "Do this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`" From 7234a62f5b40d2ee96e65570a8e7a769e5449271 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 01:59:13 -0700 Subject: [PATCH 110/272] Update loader.py --- unsloth/models/loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index b1844a1472..952f900ff4 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -527,6 +527,7 @@ def from_pretrained( qat_scheme = None, *args, **kwargs, ): + print("model_name", model_name) if token is None: token = get_token() # Login to allow private models if 
token is not None: From 68c1aba08999d4f8801cda2194bcab5234109f31 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 02:01:49 -0700 Subject: [PATCH 111/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 952f900ff4..b689b1f3c2 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -254,7 +254,9 @@ def from_pretrained( # Get base model for PEFT: if is_peft: # Check base model again for PEFT + print("is_peft", model_name) model_name = peft_config.base_model_name_or_path + print("is_peft", model_name) if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) model_config = AutoConfig.from_pretrained( From 05fc2f2628b54ee2e867ff5c307abcfda7310cce Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 04:31:12 -0700 Subject: [PATCH 112/272] extract_model_type_from_config --- unsloth/models/_utils.py | 33 ++++++++++++++++++++++++++++++++- unsloth/models/loader.py | 4 +--- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 597ed0244b..0346ba13c1 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.2" +__version__ = "2025.9.3" __all__ = [ "SUPPORTS_BFLOAT16", "is_bfloat16_supported", "is_vLLM_available", + "extract_model_type_from_config", "prepare_model_for_kbit_training", "xformers", @@ -1561,3 +1562,33 @@ def _prepare_model_for_qat(model: torch.nn.Module, qat_scheme: str) -> torch.nn. 
quantize_(model, QATConfig(base_config, step="prepare"), filter_fn=filter_fn) return model pass + + +def extract_model_type_from_config(config): + """ Gets model_type from config file - can be PEFT or normal HF """ + model_type = None + from peft import PeftConfig + if issubclass(type(config), PeftConfig): + model_type_list = re.finditer(r"transformers\.models\.([^\.]{2,})\.modeling_\1", str(config)) + model_type_list = list(model_type_list) + # Use transformers.models.gpt_oss.modeling_gpt_oss + if len(model_type_list) != 0: + model_type = model_type_list[0].group(1) + elif hasattr(config, "auto_mapping"): + # Use GptOssForCausalLM + model_type = config.auto_mapping.get("base_model_class", None) + if model_type is None: + # Last resort use model name unsloth/gpt-oss-20b-unsloth-bnb-4bit + model_type = config.base_model_name_or_path + model_type = os.path.split(model_type)[-1] + else: + + if model_type is None: + raise TypeError(f"Unsloth: Cannot determine model type for config file: {str(config)}") + + # Standardize model_type + model_type = model_type.lower() + model_type = model_type.replace("_", "-") + model_type = model_type.replace("/", "-") + return model_type +pass diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index c0b996ae02..9c26c8834e 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -20,6 +20,7 @@ HAS_FLASH_ATTENTION_SOFTCAPPING, USE_MODELSCOPE, get_transformers_model_type, + extract_model_type_from_config, ) from .granite import FastGraniteModel from .llama import FastLlamaModel, logger @@ -254,9 +255,7 @@ def from_pretrained( # Get base model for PEFT: if is_peft: # Check base model again for PEFT - print("is_peft", model_name) model_name = peft_config.base_model_name_or_path - print("is_peft", model_name) if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) model_config = AutoConfig.from_pretrained( @@ -529,7 +528,6 @@ def from_pretrained( qat_scheme = None, *args, **kwargs, ): - 
print("model_name", model_name) if token is None: token = get_token() # Login to allow private models if token is not None: From 99c7afb3fcc8aaa755dba2ad9f74140ff978028c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 21:51:46 -0700 Subject: [PATCH 113/272] Model types --- unsloth/models/_utils.py | 39 +++++++++++--- unsloth/models/loader.py | 114 +++++++++++++++++++++------------------ 2 files changed, 92 insertions(+), 61 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 0346ba13c1..f961a49de5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1566,7 +1566,9 @@ def _prepare_model_for_qat(model: torch.nn.Module, qat_scheme: str) -> torch.nn. def extract_model_type_from_config(config): """ Gets model_type from config file - can be PEFT or normal HF """ - model_type = None + if config is None: + raise TypeError(f"Unsloth: Cannot determine model type for config file: {str(config)}") + model_types = None from peft import PeftConfig if issubclass(type(config), PeftConfig): model_type_list = re.finditer(r"transformers\.models\.([^\.]{2,})\.modeling_\1", str(config)) @@ -1574,6 +1576,7 @@ def extract_model_type_from_config(config): # Use transformers.models.gpt_oss.modeling_gpt_oss if len(model_type_list) != 0: model_type = model_type_list[0].group(1) + model_types = [model_type] elif hasattr(config, "auto_mapping"): # Use GptOssForCausalLM model_type = config.auto_mapping.get("base_model_class", None) @@ -1581,14 +1584,34 @@ def extract_model_type_from_config(config): # Last resort use model name unsloth/gpt-oss-20b-unsloth-bnb-4bit model_type = config.base_model_name_or_path model_type = os.path.split(model_type)[-1] + model_types = [model_type] else: - - if model_type is None: + from collections.abc import Mapping, Sequence + def find_values(data, target_key): + stack = [data] + while stack: + obj = stack.pop() + if isinstance(obj, Mapping): + # Emit values for matches + if target_key in obj: + 
yield obj[target_key] + # Keep walking into nested values + stack.extend(obj.values()) + elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes, bytearray)): + # Walk sequences (lists/tuples/sets), but not strings/bytes + stack.extend(obj) + model_types = list(find_values(getattr(config, "to_dict", lambda *args, **kwargs: {})(), "model_type")) + pass + if model_types is None: raise TypeError(f"Unsloth: Cannot determine model type for config file: {str(config)}") - # Standardize model_type - model_type = model_type.lower() - model_type = model_type.replace("_", "-") - model_type = model_type.replace("/", "-") - return model_type + final_model_types = [] + for model_type in model_types: + model_type = model_type.lower() + model_type = model_type.replace("_", "") + model_type = model_type.replace("-", "") + model_type = model_type.replace("/", "") + model_type = model_type.replace(".", "") + final_model_types.append(model_type) + return tuple(sorted(final_model_types)) pass diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 9c26c8834e..6cefe33aaf 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -84,7 +84,8 @@ global FORCE_FLOAT32 FORCE_FLOAT32 = [ "gemma3", - "gpt_oss", + "gemma3n", + "gptoss", ] class FastLanguageModel(FastLlamaModel): @@ -178,6 +179,8 @@ def from_pretrained( autoconfig_error = None peft_error = None + model_config = None + peft_config = None try: model_config = AutoConfig.from_pretrained( model_name, @@ -201,8 +204,12 @@ def from_pretrained( peft_error = str(error) is_peft = False pass - - # Both config.json and adapter_config.json should not exist! 
+ model_types = extract_model_type_from_config(model_config or peft_config) + if len(model_types) == 1: + model_type = model_types[0] + else: + # Leave as tuple if more than one arch + model_type = model_types # Old transformers versions check both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 @@ -267,8 +274,6 @@ def from_pretrained( if not was_disabled: enable_progress_bars() - model_type = model_config.model_type - if model_type == "llama": scaling_type = None if getattr(model_config, "rope_scaling", None) is not None: @@ -494,10 +499,11 @@ def from_pretrained( from transformers import AutoModelForVision2Seq pass +# Must be alphabetically sorted for each entry DISABLE_COMPILE_MODEL_NAMES = [ - "aya-vision", + "ayavision", "modernbert", - "granite-vision", + "granite,llavanext,siglipvisionmodel", # Granite-vision 3 ] @@ -574,20 +580,55 @@ def from_pretrained( if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) + # First check if it's a normal model via AutoConfig + from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled + was_disabled = are_progress_bars_disabled() + disable_progress_bars() + + autoconfig_error = None + peft_error = None + model_config = None + peft_config = None + try: + model_config = AutoConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + is_model = True + except Exception as error: + autoconfig_error = str(error) + is_model = False + try: + peft_config = PeftConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + is_peft = True + except Exception as error: + peft_error = str(error) + is_peft = False + pass + model_types = extract_model_type_from_config(model_config or peft_config) + model_types_all = ",".join(model_types) + # Check versions lowered_model_name = model_name.lower() os.environ["UNSLOTH_MODEL_NAME"] = 
lowered_model_name LATEST = '\nPlease use transformers via `pip install --no-deps git+https://github.com/huggingface/transformers.git`' NIGHTLY = '\nPlease use nightly transformers via pip install --upgrade "transformers>=4.49.0"`' # Pixtral - if "pixtral" in lowered_model_name and transformers_version < Version("4.49.0"): + if "pixtral" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Pixtral only works on transformers >= 4.49.0." + LATEST) # Qwen 2.5 - elif "qwen2.5" in lowered_model_name and transformers_version < Version("4.49.0"): + elif "qwen25" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST) # Gemma 3 - elif "gemma-3" in lowered_model_name: - if "gemma-3n" in lowered_model_name: + elif "gemma3" in model_types_all: + if "gemma3n" in model_types_all: if transformers_version < Version("4.53.0"): raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" @@ -605,10 +646,10 @@ def from_pretrained( # common in both gemma-3 and gemma-3n os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" # Cohere - elif "c4ai-command-a-03-2025" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): + elif "cohere2" in model_types_all and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0." 
+ NIGHTLY) # Sesame - elif "csm-1b" in lowered_model_name: + elif "csm" in model_types_all: os.environ["UNSLOTH_COMPILE_DISABLE"] = "1" # Inference is too slow os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # Sesame fails os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ @@ -616,14 +657,14 @@ def from_pretrained( "if name.endswith(('_proj', 'fc1', 'fc2', 'codebook', 'head')): module.to(torch.float16)"\ ";" # Granite 4 - elif 'granite-4' in lowered_model_name: + elif 'granitemoehybrid' in model_types_all: # Granite-4 rms norms are stored as 16 bit, but we upcast os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # Olmo 2 - elif "olmo-2" in lowered_model_name and transformers_version < Version("4.50.0.dev0"): + elif "olmo2" in model_types_all and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: OLMo-2 only works on transformers >= 4.50.0." + NIGHTLY) - elif "falcon-h1" in lowered_model_name: + elif "falconh1" in model_types_all: # Falcon must use float32 Triton ie TRITON_F32_DEFAULT = 'ieee' # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ @@ -631,7 +672,7 @@ def from_pretrained( "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16)"\ ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" - elif "gpt-oss" in lowered_model_name: + elif "gptoss" in model_types_all: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB @@ -681,39 +722,6 @@ def from_pretrained( model_name = snapshot_download(model_name) pass - # First check if it's a normal model via AutoConfig - from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled - was_disabled = are_progress_bars_disabled() - disable_progress_bars() - - autoconfig_error = None - peft_error = None - try: - 
model_config = AutoConfig.from_pretrained( - model_name, - token = token, - revision = revision, - trust_remote_code = trust_remote_code, - ) - is_model = True - except Exception as error: - autoconfig_error = str(error) - is_model = False - try: - peft_config = PeftConfig.from_pretrained( - model_name, - token = token, - revision = revision, - trust_remote_code = trust_remote_code, - ) - is_peft = True - except Exception as error: - peft_error = str(error) - is_peft = False - pass - - # Both config.json and adapter_config.json should not exist! - # Old transformers versions check both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 @@ -799,8 +807,8 @@ def from_pretrained( if model_type_arch != "siglip": break global FORCE_FLOAT32 for disable_name in FORCE_FLOAT32: - if (disable_name.lower() == model_type_arch.lower().replace("-", "_") or \ - disable_name.lower() in model_name.lower()) and \ + if (disable_name.lower() == model_type_arch.lower().replace("-", "").replace("_", "") or \ + disable_name.lower() in model_types_all) and \ ((dtype == torch.float16) or not SUPPORTS_BFLOAT16): os.environ["UNSLOTH_FORCE_FLOAT32"] = "1" dtype = torch.bfloat16 # Change to bfloat16 loading @@ -846,7 +854,7 @@ def from_pretrained( ) pass # Fix SDPA - if "gemma-3n" in lowered_model_name: + if "gemma3n" in model_types_all: supports_sdpa = False pass From fc5d91de3b2200e6a4a32e865c5f18272271de5a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 22:02:56 -0700 Subject: [PATCH 114/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 6cefe33aaf..44a74601d9 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -205,6 +205,7 @@ def from_pretrained( is_peft = False pass model_types = extract_model_type_from_config(model_config or peft_config) + print("model_types", model_types) if len(model_types) == 1: model_type = model_types[0] else: @@ 
-614,6 +615,7 @@ def from_pretrained( pass model_types = extract_model_type_from_config(model_config or peft_config) model_types_all = ",".join(model_types) + print("model_types", model_types) # Check versions lowered_model_name = model_name.lower() From 702a9ead13538d5a930c9a2f644fb92671dd35f2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 22:11:08 -0700 Subject: [PATCH 115/272] get_transformers_model_type --- unsloth/models/_utils.py | 54 ---------------------------------------- unsloth/models/loader.py | 26 ++++++------------- 2 files changed, 8 insertions(+), 72 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index f961a49de5..56b98489f6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -18,7 +18,6 @@ "SUPPORTS_BFLOAT16", "is_bfloat16_supported", "is_vLLM_available", - "extract_model_type_from_config", "prepare_model_for_kbit_training", "xformers", @@ -1562,56 +1561,3 @@ def _prepare_model_for_qat(model: torch.nn.Module, qat_scheme: str) -> torch.nn. 
quantize_(model, QATConfig(base_config, step="prepare"), filter_fn=filter_fn) return model pass - - -def extract_model_type_from_config(config): - """ Gets model_type from config file - can be PEFT or normal HF """ - if config is None: - raise TypeError(f"Unsloth: Cannot determine model type for config file: {str(config)}") - model_types = None - from peft import PeftConfig - if issubclass(type(config), PeftConfig): - model_type_list = re.finditer(r"transformers\.models\.([^\.]{2,})\.modeling_\1", str(config)) - model_type_list = list(model_type_list) - # Use transformers.models.gpt_oss.modeling_gpt_oss - if len(model_type_list) != 0: - model_type = model_type_list[0].group(1) - model_types = [model_type] - elif hasattr(config, "auto_mapping"): - # Use GptOssForCausalLM - model_type = config.auto_mapping.get("base_model_class", None) - if model_type is None: - # Last resort use model name unsloth/gpt-oss-20b-unsloth-bnb-4bit - model_type = config.base_model_name_or_path - model_type = os.path.split(model_type)[-1] - model_types = [model_type] - else: - from collections.abc import Mapping, Sequence - def find_values(data, target_key): - stack = [data] - while stack: - obj = stack.pop() - if isinstance(obj, Mapping): - # Emit values for matches - if target_key in obj: - yield obj[target_key] - # Keep walking into nested values - stack.extend(obj.values()) - elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes, bytearray)): - # Walk sequences (lists/tuples/sets), but not strings/bytes - stack.extend(obj) - model_types = list(find_values(getattr(config, "to_dict", lambda *args, **kwargs: {})(), "model_type")) - pass - if model_types is None: - raise TypeError(f"Unsloth: Cannot determine model type for config file: {str(config)}") - # Standardize model_type - final_model_types = [] - for model_type in model_types: - model_type = model_type.lower() - model_type = model_type.replace("_", "") - model_type = model_type.replace("-", "") - model_type = 
model_type.replace("/", "") - model_type = model_type.replace(".", "") - final_model_types.append(model_type) - return tuple(sorted(final_model_types)) -pass diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 44a74601d9..7e8a32caa7 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -20,7 +20,6 @@ HAS_FLASH_ATTENTION_SOFTCAPPING, USE_MODELSCOPE, get_transformers_model_type, - extract_model_type_from_config, ) from .granite import FastGraniteModel from .llama import FastLlamaModel, logger @@ -204,8 +203,7 @@ def from_pretrained( peft_error = str(error) is_peft = False pass - model_types = extract_model_type_from_config(model_config or peft_config) - print("model_types", model_types) + model_types = get_transformers_model_type(model_config or peft_config) if len(model_types) == 1: model_type = model_types[0] else: @@ -581,6 +579,12 @@ def from_pretrained( if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) + # Check modelscope + if USE_MODELSCOPE and not os.path.exists(model_name): + from modelscope import snapshot_download + model_name = snapshot_download(model_name) + pass + # First check if it's a normal model via AutoConfig from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled was_disabled = are_progress_bars_disabled() @@ -613,9 +617,8 @@ def from_pretrained( peft_error = str(error) is_peft = False pass - model_types = extract_model_type_from_config(model_config or peft_config) + model_types = get_transformers_model_type(model_config or peft_config) model_types_all = ",".join(model_types) - print("model_types", model_types) # Check versions lowered_model_name = model_name.lower() @@ -719,11 +722,6 @@ def from_pretrained( os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" pass - if USE_MODELSCOPE and not os.path.exists(model_name): - from modelscope import snapshot_download - model_name = snapshot_download(model_name) - pass - # Old 
transformers versions check both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 @@ -793,15 +791,7 @@ def from_pretrained( else: redirector = contextlib.redirect_stdout(open(os.devnull, "w")) - # Get model types like Gemma3 etc - model_types = get_transformers_model_type( - model_name = model_name, - token = token, - revision = revision, - trust_remote_code = trust_remote_code, - ) model_types = ["siglip"] + model_types - # Set forced float32 env flag os.environ["UNSLOTH_FORCE_FLOAT32"] = "0" do_forced_float32 = False From 8ece4a6f915e27f536202017132d031094a518ac Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 22:14:10 -0700 Subject: [PATCH 116/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 7e8a32caa7..43c14050c2 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -204,6 +204,7 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) + print("model_types", model_types) if len(model_types) == 1: model_type = model_types[0] else: @@ -619,6 +620,7 @@ def from_pretrained( pass model_types = get_transformers_model_type(model_config or peft_config) model_types_all = ",".join(model_types) + print("model_types", model_types) # Check versions lowered_model_name = model_name.lower() From f3ac0e3b6d382dd432af4a49c919e4d8a2700480 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 22:18:59 -0700 Subject: [PATCH 117/272] Update loader.py --- unsloth/models/loader.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 43c14050c2..27fb3afe41 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -84,7 +84,7 @@ FORCE_FLOAT32 = [ "gemma3", "gemma3n", - "gptoss", + "gpt_oss", ] class FastLanguageModel(FastLlamaModel): @@ -204,7 +204,6 @@ def 
from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) - print("model_types", model_types) if len(model_types) == 1: model_type = model_types[0] else: @@ -501,9 +500,9 @@ def from_pretrained( # Must be alphabetically sorted for each entry DISABLE_COMPILE_MODEL_NAMES = [ - "ayavision", + "aya_vision", "modernbert", - "granite,llavanext,siglipvisionmodel", # Granite-vision 3 + "granite,llava_next", # Granite-vision 3 ] @@ -620,7 +619,6 @@ def from_pretrained( pass model_types = get_transformers_model_type(model_config or peft_config) model_types_all = ",".join(model_types) - print("model_types", model_types) # Check versions lowered_model_name = model_name.lower() @@ -631,7 +629,7 @@ def from_pretrained( if "pixtral" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Pixtral only works on transformers >= 4.49.0." + LATEST) # Qwen 2.5 - elif "qwen25" in model_types_all and transformers_version < Version("4.49.0"): + elif "qwen2_5" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST) # Gemma 3 elif "gemma3" in model_types_all: @@ -671,7 +669,7 @@ def from_pretrained( # Olmo 2 elif "olmo2" in model_types_all and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: OLMo-2 only works on transformers >= 4.50.0." 
+ NIGHTLY) - elif "falconh1" in model_types_all: + elif "falcon_h1" in model_types_all: # Falcon must use float32 Triton ie TRITON_F32_DEFAULT = 'ieee' # since Mamba kernels error out on using lower precision os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ @@ -679,7 +677,7 @@ def from_pretrained( "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16)"\ ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" - elif "gptoss" in model_types_all: + elif "gpt_oss" in model_types_all: os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB From d2b0d4193a6e32cf370f2008d8ad05011a6ad0a6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 9 Sep 2025 22:22:15 -0700 Subject: [PATCH 118/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 27fb3afe41..de2f32f9af 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -809,7 +809,7 @@ def from_pretrained( # Patch gradient checkpointing if use_gradient_checkpointing == "unsloth": patch_unsloth_smart_gradient_checkpointing(dtype = dtype) - + print(model_types) with redirector: patch_loss_functions(torch_compile = False) model_types, supports_sdpa = unsloth_compile_transformers( From e5920fe7027e7caf8602fc9a7d602a84ef197bed Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Sep 2025 01:21:49 -0700 Subject: [PATCH 119/272] Update rl.py --- unsloth/models/rl.py | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index f342a4d86b..14b75f6746 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -44,6 +44,8 @@ } from trl import __version__ as trl_version +from unsloth_zoo.utils import Version +trl_version = Version(trl_version) def 
vLLMSamplingParams(**kwargs): from vllm import SamplingParams @@ -804,7 +806,7 @@ def patch_functions(RLTrainer, trainer_file, RLTrainer_name, all_imports, import " " * 12 + "if (getattr(args, 'use_vllm', False) == False):\n" + \ " " * 16 + "args.use_vllm = True\n" - if "grpo" in trainer_file and trl_version >= "0.18": + if "grpo" in trainer_file and trl_version >= Version("0.18.0"): # If model has vllm_engine, then use vllm in colocate mode. Donot wait for server vllm_setter += \ " " * 12 + "args.vllm_mode='colocate'\n" @@ -850,26 +852,27 @@ def patch_functions(RLTrainer, trainer_file, RLTrainer_name, all_imports, import sampling_params # Add spaces # count the indentation of last line of sampling_params. - last_line = sampling_params.split("\n")[-1] - last_prev_line = sampling_params.split("\n")[-2] - last_prev_indentation = len(last_prev_line) - len(last_prev_line.lstrip()) - last_indentation = len(last_line) - len(last_line.lstrip()) - - - # Add extra arguments to SamplingParams - extra = "**getattr(getattr(args, 'vllm_sampling_params', vLLMSamplingParams()), '_set_kwargs', {})" - # Backwards replace - to_replace = ",\n" + " "*last_prev_indentation + extra + ",\n" + " "*last_indentation + ")" - sampling_params = to_replace.join(sampling_params.rsplit(")", 1)) - # Strip multiple commas - sampling_params = re.sub(r"[\,][\s]{0,}\,", ",", sampling_params) - - new_vllm_part = \ - f"\n{' '*8}if {args}.use_vllm:\n{sampling_params}"\ - f"\n{' '*8}else:\n" + splitted_sampling_params = sampling_params.split("\n") + if len(splitted_sampling_params) >= 2: + last_line = splitted_sampling_params[-1] + last_prev_line = splitted_sampling_params[-2] + last_prev_indentation = len(last_prev_line) - len(last_prev_line.lstrip()) + last_indentation = len(last_line) - len(last_line.lstrip()) + + # Add extra arguments to SamplingParams + extra = "**getattr(getattr(args, 'vllm_sampling_params', vLLMSamplingParams()), '_set_kwargs', {})" + # Backwards replace + to_replace = ",\n" + " 
"*last_prev_indentation + extra + ",\n" + " "*last_indentation + ")" + sampling_params = to_replace.join(sampling_params.rsplit(")", 1)) + # Strip multiple commas + sampling_params = re.sub(r"[\,][\s]{0,}\,", ",", sampling_params) + + new_vllm_part = \ + f"\n{' '*8}if {args}.use_vllm:\n{sampling_params}"\ + f"\n{' '*8}else:\n" pass - if trl_version >= "0.18": + if trl_version >= Version("0.18.0"): # Replace LLM init with already existing vLLM engine for colocate mode vllm_llm_init_pattern = r"self\.llm\s*=\s*LLM\(.*?\)*\)\s*?\n(?!,)" vllm_llm_replacement = "self.llm = model.vllm_engine\n" @@ -881,7 +884,6 @@ def patch_functions(RLTrainer, trainer_file, RLTrainer_name, all_imports, import ) init = init.replace(vllm_part, new_vllm_part) - pass # Search for vLLM calling in all child functions From bf0367eb45dc731104968052415184b8e2d080dc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Sep 2025 01:24:02 -0700 Subject: [PATCH 120/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c2cb87ce3b..c860a92db6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.3", + "unsloth_zoo>=2025.9.4", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.3", + "unsloth_zoo>=2025.9.4", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", From d2c2cc195a99b6b4dbeab7b6f65d1b302b7a9591 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 10 Sep 2025 01:26:58 -0700 Subject: [PATCH 121/272] Update loader.py --- unsloth/models/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index de2f32f9af..a57deef000 100644 --- 
a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -809,7 +809,6 @@ def from_pretrained( # Patch gradient checkpointing if use_gradient_checkpointing == "unsloth": patch_unsloth_smart_gradient_checkpointing(dtype = dtype) - print(model_types) with redirector: patch_loss_functions(torch_compile = False) model_types, supports_sdpa = unsloth_compile_transformers( From 35ca1776b08f81f05e16e268f09cb444f1af1e1b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 12 Sep 2025 18:53:46 -0700 Subject: [PATCH 122/272] Update loader.py --- unsloth/models/loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a57deef000..5ad283d39a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -204,6 +204,7 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) + print("207", model_types_all) if len(model_types) == 1: model_type = model_types[0] else: @@ -619,6 +620,7 @@ def from_pretrained( pass model_types = get_transformers_model_type(model_config or peft_config) model_types_all = ",".join(model_types) + print("623", model_types_all) # Check versions lowered_model_name = model_name.lower() From 2eaf868efa817657405b4b67416b91be171b6285 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 12 Sep 2025 18:55:47 -0700 Subject: [PATCH 123/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 5ad283d39a..fd41390889 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -204,7 +204,7 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) - print("207", model_types_all) + print("207", model_types) if len(model_types) == 1: model_type = model_types[0] else: From 7c892e798fa9ff71f25185ad5e4fb353f3b1a7e6 Mon Sep 17 00:00:00 2001 From: Daniel Han 
Date: Sat, 13 Sep 2025 02:21:02 -0700 Subject: [PATCH 124/272] Update loader.py --- unsloth/models/loader.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index fd41390889..ab258f3ed9 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -204,7 +204,6 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) - print("207", model_types) if len(model_types) == 1: model_type = model_types[0] else: @@ -620,11 +619,11 @@ def from_pretrained( pass model_types = get_transformers_model_type(model_config or peft_config) model_types_all = ",".join(model_types) - print("623", model_types_all) # Check versions lowered_model_name = model_name.lower() - os.environ["UNSLOTH_MODEL_NAME"] = lowered_model_name + if os.environ.get("UNSLOTH_MODEL_NAME", "") == "": + os.environ["UNSLOTH_MODEL_NAME"] = lowered_model_name LATEST = '\nPlease use transformers via `pip install --no-deps git+https://github.com/huggingface/transformers.git`' NIGHTLY = '\nPlease use nightly transformers via pip install --upgrade "transformers>=4.49.0"`' # Pixtral From 72ff24c5ebff286427f46d47a46b82627533ed7f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 13 Sep 2025 21:15:04 -0700 Subject: [PATCH 125/272] Versioning --- pyproject.toml | 4 ++-- unsloth/__init__.py | 2 +- unsloth/models/_utils.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d77683c00a..8df936f807 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.5", + "unsloth_zoo>=2025.9.6", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.5", + "unsloth_zoo>=2025.9.6", "packaging", "tyro", 
"transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1", diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 8255e505a8..1be571b69b 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -240,7 +240,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 # Check for unsloth_zoo try: unsloth_zoo_version = importlib_version("unsloth_zoo") - if Version(unsloth_zoo_version) < Version("2025.9.5"): + if Version(unsloth_zoo_version) < Version("2025.9.6"): print( "Unsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\n"\ "Do this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`" diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index e3ac56ac83..4cf34aa007 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.4" +__version__ = "2025.9.5" __all__ = [ "SUPPORTS_BFLOAT16", From 227842c5b87203c7c4ff1c2fc76763c79f33493c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 00:00:15 -0700 Subject: [PATCH 126/272] Update _utils.py --- unsloth/models/_utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 4cf34aa007..707d7220b2 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -614,6 +614,18 @@ def _is_openai_available(): return False # Get Xformers try: from xformers import __version__ as xformers_version + # [TODO] Xformers does NOT work on RTX 50x (12), B200 (10), Jetson (11) + # See https://github.com/facebookresearch/xformers/issues/1329 + # CUDA error (/workspace/xfrm2/third_party/flash-attention/hopper/flash_fwd_launch_template.h:188) + major_version, minor_version = torch.cuda.get_device_capability() + if ( + f"{major_version}.{minor_version}" in ("10.0", 
"11.0", "12.0")) and \ + (xformers_version in (Version("0.0.32.post2"),) + ): + raise NotImplementedError( + "Unsloth: Xformers does not work in RTX 50X, Blackwell GPUs as of yet." + ) + pass # Temporarily disable 0.0.27 and higher - inference issues if False: #Version(xformers_version) >= Version("0.0.27"): raise ImportError( @@ -661,7 +673,9 @@ def _is_openai_available(): return False pass import xformers.ops.fmha as xformers xformers_attention = xformers.memory_efficient_attention -except: +except Exception as e: + print("========\nSwitching to SDPA PyTorch native attention which is slightly slower.\n========\n") + print(str(e)) xformers = None xformers_attention = None xformers_version = None From 505ae67fe77b77c04faa7cfb3284fd25441b5ade Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 00:03:50 -0700 Subject: [PATCH 127/272] Update _utils.py --- unsloth/models/_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 707d7220b2..3878367650 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -84,7 +84,7 @@ from unsloth_zoo.utils import Version from importlib.metadata import version as importlib_version from unsloth import DEVICE_TYPE, DEVICE_COUNT - +from unsloth_zoo.log import logger from unsloth_zoo.tokenizer_utils import ( patch_tokenizer as _patch_tokenizer, ) @@ -608,8 +608,6 @@ def _is_openai_available(): return False elif DEVICE_TYPE == "xpu": SUPPORTS_BFLOAT16 = True -from transformers.models.llama.modeling_llama import logger - # ============================================= # Get Xformers try: From 80465dcabe0bd75dc8b43fddf3d8d672608fd087 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 00:06:36 -0700 Subject: [PATCH 128/272] Update _utils.py --- unsloth/models/_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3878367650..2abc6b269b 
100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -617,13 +617,12 @@ def _is_openai_available(): return False # CUDA error (/workspace/xfrm2/third_party/flash-attention/hopper/flash_fwd_launch_template.h:188) major_version, minor_version = torch.cuda.get_device_capability() if ( - f"{major_version}.{minor_version}" in ("10.0", "11.0", "12.0")) and \ - (xformers_version in (Version("0.0.32.post2"),) + (f"{major_version}.{minor_version}" in ("10.0", "11.0", "12.0")) and \ + (Version(xformers_version) in (Version("0.0.32.post2"),)) ): raise NotImplementedError( "Unsloth: Xformers does not work in RTX 50X, Blackwell GPUs as of yet." ) - pass # Temporarily disable 0.0.27 and higher - inference issues if False: #Version(xformers_version) >= Version("0.0.27"): raise ImportError( From 4150e081ada733352975234f5a42f97a696a53c3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 01:21:43 -0700 Subject: [PATCH 129/272] Update _utils.py --- unsloth/models/_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 2abc6b269b..a559d34ca4 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -621,7 +621,11 @@ def _is_openai_available(): return False (Version(xformers_version) in (Version("0.0.32.post2"),)) ): raise NotImplementedError( - "Unsloth: Xformers does not work in RTX 50X, Blackwell GPUs as of yet." + "Unsloth: Xformers does not work in RTX 50X, Blackwell GPUs as of yet. 
Please build from source via\n"\ + "```\n"\ + "pip install ninja\n"\ + "pip install -v --no-build-isolation -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers\n"\ + "```\n" ) # Temporarily disable 0.0.27 and higher - inference issues if False: #Version(xformers_version) >= Version("0.0.27"): From 032c2c840067870adbbba78ad3088ccd5e2ff849 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 22:52:32 -0700 Subject: [PATCH 130/272] Update vision.py --- unsloth/models/vision.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 1451ed92cd..2c77169cb9 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -636,24 +636,17 @@ def get_peft_model( torch.xpu.empty_cache() pass max_seq_length = model.max_seq_length - # if we pass loftq_config = None we will get an error + # If we pass loftq_config = None we will get an error loftq_config = validate_loftq_config(loftq_config, lora_dropout, bias, init_lora_weights, model) - lora_config_dict = { - "r" : r, - "lora_alpha" : lora_alpha, - "target_modules" : target_modules, - "target_parameters" : kwargs.get("target_parameters", None), - "lora_dropout" : lora_dropout, - "bias" : bias, - "task_type" : task_type, - "modules_to_save" : modules_to_save, - "use_rslora" : use_rslora, - "init_lora_weights" : init_lora_weights, - "loftq_config" : loftq_config, - } + + # Get only allowed parameters for LoraConfig + local_variables = { **locals(), **kwargs, } + del local_variables["kwargs"] + allowed_parameters = inspect.signature(LoraConfig).parameters.keys() lora_config = LoraConfig( - **{k:v for k,v in lora_config_dict.items() if k in LoraConfig.__doc__}, + **{ k : v for k, v in local_variables.items() if k in allowed_parameters }, ) + print(lora_config) model = prepare_model_for_kbit_training( model, use_gradient_checkpointing = use_gradient_checkpointing, From 
b105aae096e46646bf9ea5b7e0f541cad981f066 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 15 Sep 2025 23:00:14 -0700 Subject: [PATCH 131/272] Update vision.py --- unsloth/models/vision.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 2c77169cb9..f8c0f866f9 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -646,7 +646,6 @@ def get_peft_model( lora_config = LoraConfig( **{ k : v for k, v in local_variables.items() if k in allowed_parameters }, ) - print(lora_config) model = prepare_model_for_kbit_training( model, use_gradient_checkpointing = use_gradient_checkpointing, From 400df38fb04aaec151c1d5b1e0d2a1ac23ceca6f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 16 Sep 2025 03:00:39 -0700 Subject: [PATCH 132/272] Fix DataParallel --- unsloth/models/llama.py | 7 +++++-- unsloth/models/rl.py | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index f7a53d05fd..e04ffd029e 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1200,7 +1200,8 @@ def _CausalLM_fast_forward( if not RETURN_LOGITS and labels is not None: - n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None) + n_items = kwargs.get("num_items_in_batch", None) + if n_items is None: n_items = kwargs.get("n_items", None) if self.config.model_type == "falcon_h1": hidden_states = hidden_states * self.config.lm_head_multiplier @@ -1264,12 +1265,14 @@ def _CausalLM_fast_forward( shift_labels[..., :-1] = labels[..., 1:] shift_labels[..., -1] = -100 # shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]])) + n_items = kwargs.get("num_items_in_batch", None) + if n_items is None: n_items = kwargs.get("n_items", None) loss = fast_cross_entropy_loss( logits = shift_logits, labels = shift_labels, logit_softcapping = logit_softcapping, logit_scaling = logit_scaling, - n_items = 
kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None), + n_items = n_items, ) else: if logit_scaling != 0: diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 53f5eee66c..9e940c763b 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -110,6 +110,7 @@ def generate_with_clone(*args, **kwargs): from contextlib import nullcontext from torch.nn import functional as F from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling +from transformers.training_args import ParallelMode torch_compile_options = {{ "epilogue_fusion" : True, @@ -160,6 +161,11 @@ def __init__({RLTrainer_arguments}, ): if args is None: args = Unsloth{RLConfig_name}() {RLTrainer_extra_args} + # [TODO] Fix up DataParallel multiplying batch sizes + # [TODO] DDP works, but DP seems to not work? [TODO] + if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: + if getattr(args, "_n_gpu", 1) != 1: + args._n_gpu = 1 super().__init__({RLTrainer_call_args}{RLTrainer_kwargs}) {RLTrainer_post} pass From 809a8b3b206db30c676852af07270db8c44b7319 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 16 Sep 2025 03:02:52 -0700 Subject: [PATCH 133/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a559d34ca4..194d18771c 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.9.5" +__version__ = "2025.9.6" __all__ = [ "SUPPORTS_BFLOAT16", From 3dcc0911eb5e5ae360456e281f3e9ca99c5f95b8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 02:13:07 -0700 Subject: [PATCH 134/272] Update rl.py --- unsloth/models/rl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 3e2fcf22be..6f1f000e68 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -271,14 +271,17 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): "if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')\n"\ "if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`')\n"\ "if force_float32:\n"\ + " # Forced float32 training\n"\ " args.fp16 = False\n"\ " args.bf16 = False\n"\ " os.environ['ACCELERATE_MIXED_PRECISION'] = 'no'\n"\ "elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32':\n"\ + " # Mixed precision training\n"\ " args.fp16 = float16\n"\ " args.bf16 = not float16\n"\ " os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16'\n" "elif mixed_precision_dtype == 'bfloat16':\n"\ + " # Both False since bfloat16 full finetuning doesn't do any autocasting.\n"\ " args.fp16 = False\n"\ " args.bf16 = False\n"\ " os.environ['ACCELERATE_MIXED_PRECISION'] = 'no'\n" From 28b1d50016921db9ada7bcdcdb67c61b92c9f379 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 02:40:22 -0700 Subject: [PATCH 135/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 52c114fab6..60742b7fdc 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ 
-28,6 +28,7 @@ patch_vllm, delete_vllm, ) +from unsloth_zoo.log import logger import numpy as np from .synthetic_configs import ( @@ -117,6 +118,7 @@ def __init__( else: subprocess_commands += ["--" + flag, which,] pass + logger.info(subprocess_commands) vllm_process = subprocess.Popen( subprocess_commands, stdout = subprocess.PIPE, From de162d3e2a724dd178d24961bd9b989a68b70f2d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 02:56:36 -0700 Subject: [PATCH 136/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 60742b7fdc..2cca155d6d 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -77,6 +77,7 @@ def __init__( return_args = True, enable_lora = False, use_bitsandbytes = False, + compilation_config = 3, **kwargs, ) if "dtype" in engine_args: From a507a7d82bb1792986ffaa99c9f10b4de7e6bba3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:03:32 -0700 Subject: [PATCH 137/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 2cca155d6d..d52f1df373 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -99,7 +99,7 @@ def __init__( if "model" in engine_args: del engine_args["model"] if "compilation_config" in engine_args: # Cannot parse in vllm serve - engine_args["compilation_config"] = 3 + engine_args["compilation_config"] = "'" + str(engine_args["compilation_config"]) + "'" subprocess_commands = [ "vllm", "serve", str(model_name), From cda72638c333e653d1ac74df30a69b6abfbf3624 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:06:04 -0700 Subject: [PATCH 138/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index d52f1df373..68dd475e59 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -99,7 +99,7 @@ def __init__( if "model" in engine_args: del engine_args["model"] if "compilation_config" in engine_args: # Cannot parse in vllm serve - engine_args["compilation_config"] = "'" + str(engine_args["compilation_config"]) + "'" + engine_args["compilation_config"] = '"' + str(engine_args["compilation_config"]) + '"' subprocess_commands = [ "vllm", "serve", str(model_name), From dd8ad929e13235091c0379a03a2f09ac3a5c61a1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:07:03 -0700 Subject: [PATCH 139/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 68dd475e59..53d655ce0e 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -97,15 +97,15 @@ def __init__( engine_args["dtype"] = "auto" if "device" in engine_args: del engine_args["device"] if "model" in engine_args: del engine_args["model"] - if "compilation_config" in engine_args: - # Cannot parse in vllm serve - engine_args["compilation_config"] = '"' + str(engine_args["compilation_config"]) + '"' subprocess_commands = [ "vllm", "serve", str(model_name), ] for key, value in engine_args.items(): flag = key.replace("_", "-") + if key == "compilation_config": + subprocess_commands += ["--" + '"' + str(value) + '"',] + continue which = str(value).replace("torch.", "") if which == "True": # Ignore --enforce-eager True From a725b98363e50b7c80649e83975c1f9017f01eed Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:24:07 -0700 Subject: [PATCH 140/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py 
index 53d655ce0e..7c421b33bf 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -104,7 +104,7 @@ def __init__( for key, value in engine_args.items(): flag = key.replace("_", "-") if key == "compilation_config": - subprocess_commands += ["--" + '"' + str(value) + '"',] + subprocess_commands += ["--" + flag, '"' + str(value) + '"',] continue which = str(value).replace("torch.", "") if which == "True": From 321f1a33b0e243691b8e297ac0170393d51456ff Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:26:57 -0700 Subject: [PATCH 141/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 7c421b33bf..7e27b8261d 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -104,7 +104,7 @@ def __init__( for key, value in engine_args.items(): flag = key.replace("_", "-") if key == "compilation_config": - subprocess_commands += ["--" + flag, '"' + str(value) + '"',] + subprocess_commands += ["--" + flag, "'" + str(value) + "'",] continue which = str(value).replace("torch.", "") if which == "True": From 357e5019b7341c9b19f62db146950113e4aa58b9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:30:11 -0700 Subject: [PATCH 142/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 7e27b8261d..aa5296c58b 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -104,7 +104,8 @@ def __init__( for key, value in engine_args.items(): flag = key.replace("_", "-") if key == "compilation_config": - subprocess_commands += ["--" + flag, "'" + str(value) + "'",] + quoted_compilation_config = '"' + str(value) + '"' + subprocess_commands += ["--" + flag, "'" + quoted_compilation_config[1:-1] + "'",] continue 
which = str(value).replace("torch.", "") if which == "True": From 8a03656b958d023c4e2639ef3cf7d6c0616f4efb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:32:54 -0700 Subject: [PATCH 143/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index aa5296c58b..eb73a5fb84 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -121,6 +121,7 @@ def __init__( subprocess_commands += ["--" + flag, which,] pass logger.info(subprocess_commands) + print(subprocess_commands) vllm_process = subprocess.Popen( subprocess_commands, stdout = subprocess.PIPE, From d7832d01baaef9a791c509d69c122c61385425f2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 03:42:00 -0700 Subject: [PATCH 144/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index eb73a5fb84..70f94e5584 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -102,10 +102,9 @@ def __init__( "vllm", "serve", str(model_name), ] for key, value in engine_args.items(): - flag = key.replace("_", "-") + flag = key.replace("_", "-") if key == "compilation_config": - quoted_compilation_config = '"' + str(value) + '"' - subprocess_commands += ["--" + flag, "'" + quoted_compilation_config[1:-1] + "'",] + subprocess_commands += ["--" + flag, str(value),] continue which = str(value).replace("torch.", "") if which == "True": @@ -121,7 +120,6 @@ def __init__( subprocess_commands += ["--" + flag, which,] pass logger.info(subprocess_commands) - print(subprocess_commands) vllm_process = subprocess.Popen( subprocess_commands, stdout = subprocess.PIPE, From 84f54348de880229dd67afbb737ea247839a6afa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 04:09:59 -0700 Subject: [PATCH 145/272] 
Update synthetic.py --- unsloth/dataprep/synthetic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index 70f94e5584..b75918237b 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -104,7 +104,8 @@ def __init__( for key, value in engine_args.items(): flag = key.replace("_", "-") if key == "compilation_config": - subprocess_commands += ["--" + flag, str(value),] + # [TODO] Unsure why subprocess doesn't process json properly + subprocess_commands += ["-O3",] continue which = str(value).replace("torch.", "") if which == "True": From 17b2e98f3df7735166a6c3f8b4ba2689418bc6e3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 04:16:41 -0700 Subject: [PATCH 146/272] Update synthetic.py --- unsloth/dataprep/synthetic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/dataprep/synthetic.py b/unsloth/dataprep/synthetic.py index b75918237b..9651df23e8 100644 --- a/unsloth/dataprep/synthetic.py +++ b/unsloth/dataprep/synthetic.py @@ -105,7 +105,8 @@ def __init__( flag = key.replace("_", "-") if key == "compilation_config": # [TODO] Unsure why subprocess doesn't process json properly - subprocess_commands += ["-O3",] + # Also -O3 breaks on T4! 
+ # subprocess_commands += ["-O3",] continue which = str(value).replace("torch.", "") if which == "True": From 5364138046cdddedc37594ae87f5e51bb0265031 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 06:35:44 -0700 Subject: [PATCH 147/272] Update mapper.py --- unsloth/models/mapper.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index be269316fe..eb9119b681 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -956,6 +956,16 @@ "google/gemma-3-270m", "unsloth/gemma-3-270m-bnb-4bit", ), + "unsloth/Magistral-Small-2507-unsloth-bnb-4bit" : ( + "unsloth/Magistral-Small-2507", + "mistralai/Magistral-Small-2507", + "unsloth/Magistral-Small-2507-bnb-4bit", + ), + "unsloth/Magistral-Small-2509-unsloth-bnb-4bit" : ( + "unsloth/Magistral-Small-2509", + "mistralai/Magistral-Small-2509", + "unsloth/Magistral-Small-2509-bnb-4bit", + ), } INT_TO_FLOAT_MAPPER = {} From 8dbd0084d4097cf3c5eb03027ecdf5ec5bdacc17 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 08:21:10 -0700 Subject: [PATCH 148/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 70fc3bdedc..c3915c1cd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.7", + "unsloth_zoo>=2025.9.8", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.7", + "unsloth_zoo>=2025.9.8", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 41adc74650..d2ebc29bf2 100644 --- a/unsloth/models/_utils.py +++ 
b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.6" +__version__ = "2025.9.7" __all__ = [ "SUPPORTS_BFLOAT16", From d7ca79f18ef5b794b3684768708ab7ebb57a4acc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 22:01:19 -0700 Subject: [PATCH 149/272] Update loader.py --- unsloth/models/loader.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index da40fb57d8..e891340221 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -83,8 +83,8 @@ global FORCE_FLOAT32 FORCE_FLOAT32 = [ - "gemma3,", # Add comma bc gemma3 will match gemma3n - "gemma3n", + "gemma3,", + "gemma3n,", "gpt_oss", ] @@ -627,7 +627,7 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) - model_types_all = ",".join(model_types) + model_types_all = ",".join(model_types) + "," # Check versions lowered_model_name = model_name.lower() @@ -642,21 +642,22 @@ def from_pretrained( elif "qwen2_5" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." 
+ LATEST) # Gemma 3 - elif "gemma3" in model_types_all: - if "gemma3n" in model_types_all: - if transformers_version < Version("4.53.0"): - raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) - os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" - os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "float16;torch.float16;torch.float16;"\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ - ";"\ - "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" - else: - if transformers_version < Version("4.50.0.dev0"): - raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) - + elif "gemma3," in model_types_all: + if transformers_version < Version("4.50.0.dev0"): + raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) + # Set norms to float32 since anyways they get upcasted to float32 + # common in both gemma-3 and gemma-3n + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" + elif "gemma3n," in model_types_all: + if transformers_version < Version("4.53.0"): + raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) + os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" + os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ + "float16;torch.float16;torch.float16;"\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ + ";"\ + "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" # Set norms to float32 since anyways they get upcasted to float32 # common in both gemma-3 and gemma-3n os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" @@ -811,7 +812,7 @@ def from_pretrained( for disable_name in FORCE_FLOAT32: # add comma to model_types_all matching in case of exact match for end if (disable_name.lower() == model_type_arch.lower().replace("-", "").replace("_", 
"") or \ - disable_name.lower() in f'{model_types_all},') and \ + disable_name.lower() in model_types_all) and \ ((dtype == torch.float16) or not SUPPORTS_BFLOAT16): os.environ["UNSLOTH_FORCE_FLOAT32"] = "1" dtype = torch.bfloat16 # Change to bfloat16 loading From bb90785ad3066b4ba926cf1e607f120128c32982 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 17 Sep 2025 22:10:29 -0700 Subject: [PATCH 150/272] Update loader.py --- unsloth/models/loader.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index e891340221..da40fb57d8 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -83,8 +83,8 @@ global FORCE_FLOAT32 FORCE_FLOAT32 = [ - "gemma3,", - "gemma3n,", + "gemma3,", # Add comma bc gemma3 will match gemma3n + "gemma3n", "gpt_oss", ] @@ -627,7 +627,7 @@ def from_pretrained( is_peft = False pass model_types = get_transformers_model_type(model_config or peft_config) - model_types_all = ",".join(model_types) + "," + model_types_all = ",".join(model_types) # Check versions lowered_model_name = model_name.lower() @@ -642,22 +642,21 @@ def from_pretrained( elif "qwen2_5" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST) # Gemma 3 - elif "gemma3," in model_types_all: - if transformers_version < Version("4.50.0.dev0"): - raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." 
+ NIGHTLY) - # Set norms to float32 since anyways they get upcasted to float32 - # common in both gemma-3 and gemma-3n - os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" - elif "gemma3n," in model_types_all: - if transformers_version < Version("4.53.0"): - raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) - os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" - os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "float16;torch.float16;torch.float16;"\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ - ";"\ - "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" + elif "gemma3" in model_types_all: + if "gemma3n" in model_types_all: + if transformers_version < Version("4.53.0"): + raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) + os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" + os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ + "float16;torch.float16;torch.float16;"\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ + ";"\ + "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" + else: + if transformers_version < Version("4.50.0.dev0"): + raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." 
+ NIGHTLY) + # Set norms to float32 since anyways they get upcasted to float32 # common in both gemma-3 and gemma-3n os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" @@ -812,7 +811,7 @@ def from_pretrained( for disable_name in FORCE_FLOAT32: # add comma to model_types_all matching in case of exact match for end if (disable_name.lower() == model_type_arch.lower().replace("-", "").replace("_", "") or \ - disable_name.lower() in model_types_all) and \ + disable_name.lower() in f'{model_types_all},') and \ ((dtype == torch.float16) or not SUPPORTS_BFLOAT16): os.environ["UNSLOTH_FORCE_FLOAT32"] = "1" dtype = torch.bfloat16 # Change to bfloat16 loading From 3289826add711c92dee44f1117fa6a54d6e68b91 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Sep 2025 02:22:14 -0700 Subject: [PATCH 151/272] Update rl.py --- unsloth/models/rl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 6f1f000e68..3d5f6d084b 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -259,7 +259,8 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): "use_fp16 = getattr(args, 'fp16', False)\n"\ "if type(use_fp16) is not bool: use_fp16 = False\n"\ "force_float32 = False\n"\ - "if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1':\n"\ + "full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1'\n"\ + "if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'):\n"\ " print('Unsloth: Switching to float32 training since model cannot work with float16')\n"\ " force_float32 = True\n"\ "mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32')\n"\ From a04211436f8a11aaece59d4662a29ab4c825a0b1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Sep 2025 04:31:28 -0700 Subject: [PATCH 152/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml 
b/pyproject.toml index c3915c1cd6..4f9c308b32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.8", + "unsloth_zoo>=2025.9.9", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.8", + "unsloth_zoo>=2025.9.9", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 79134005dc..5f41352d97 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.7" +__version__ = "2025.9.8" __all__ = [ "SUPPORTS_BFLOAT16", From ffa04dde12b7fa9430566cce8b0309531f7af2ba Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Sep 2025 15:45:42 -0700 Subject: [PATCH 153/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5f41352d97..79134005dc 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.9.8" +__version__ = "2025.9.7" __all__ = [ "SUPPORTS_BFLOAT16", From b3654449bdd237e642e5f44c6e96c74e203232f7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 18 Sep 2025 18:57:41 -0700 Subject: [PATCH 154/272] Fix auto_mapping --- unsloth/models/llama.py | 4 +++- unsloth/models/vision.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 7414c07326..6326f519f1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -25,7 +25,7 @@ from torch.nn.functional import scaled_dot_product_attention from transformers import __version__ as transformers_version from unsloth_zoo.utils import Version, _get_dtype -from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs +from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs, fix_lora_auto_mapping from unsloth_zoo.peft_utils import SKIP_QUANTIZATION_MODULES from unsloth import DEVICE_TYPE, DEVICE_COUNT @@ -2632,6 +2632,8 @@ def get_peft_model( pass model = _get_peft_model(model, lora_config) + # Fix LoraConfig.auto_mapping is None + fix_lora_auto_mapping(model) # Apply QAT + LoRA if specified if qat_scheme is not None: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index d6c710c281..d03ffb45a9 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -43,7 +43,7 @@ from transformers import __version__ as transformers_version from triton import __version__ as triton_version from unsloth_zoo.utils import _get_dtype -from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs +from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs, fix_lora_auto_mapping from unsloth_zoo.patching_utils import patch_model_and_tokenizer from unsloth_zoo.training_utils import prepare_model_for_training @@ -758,6 +758,8 @@ def get_peft_model( use_gradient_checkpointing = use_gradient_checkpointing, ) model = _get_peft_model(model, lora_config) + # Fix 
LoraConfig.auto_mapping is None + fix_lora_auto_mapping(model) # Enable gradients on modules which are trainable requires_grad_for_gradient_checkpointing(model) trust_remote_code = getattr(model, "_unsloth_trust_remote_code", False) From 5ce7bf895269e15d56b2de088993e829285c0805 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 02:09:50 -0700 Subject: [PATCH 155/272] Update loader.py --- unsloth/models/loader.py | 126 ++++++++++++++++++++++----------------- 1 file changed, 70 insertions(+), 56 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 3eb80fc0dd..ce90874f94 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -82,12 +82,28 @@ ) global FORCE_FLOAT32 +# Forces float32 precision since float16 goes to infinity FORCE_FLOAT32 = [ - "gemma3,", # Add comma bc gemma3 will match gemma3n + "gemma3,", # Add comma bc gemma3 will match gemma3n "gemma3n", "gpt_oss", ] +global DISABLE_COMPILE_MODEL_NAMES +# Must be alphabetically sorted for each entry +DISABLE_COMPILE_MODEL_NAMES = [ + "aya_vision", + "modernbert", + "granite,llava_next", # Granite-vision 3 +] + +global DISABLE_SDPA_MODEL_NAMES +# Disables some SDPA modules since it's wrong +DISABLE_SDPA_MODEL_NAMES = [ + "gemma3,", # Add comma bc gemma3 will match gemma3n +] + + class FastLanguageModel(FastLlamaModel): @staticmethod def from_pretrained( @@ -213,16 +229,27 @@ def from_pretrained( peft_error = str(error) is_peft = False pass - model_types = get_transformers_model_type(peft_config or model_config) + + # Old transformers versions check + both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 + + # Error out if both LoRA and normal model config exists. + if both_exist: + raise RuntimeError( + "Unsloth: Your repo has a LoRA adapter and a base model.\n"\ + "You have 2 files `config.json` and `adapter_config.json`.\n"\ + "We must only allow one config file.\n"\ + "Please separate the LoRA and base models to 2 repos." 
+ ) + model_types = get_transformers_model_type( + peft_config if peft_config is not None else model_config + ) if len(model_types) == 1: model_type = model_types[0] else: # Leave as tuple if more than one arch model_type = model_types - # Old transformers versions check - both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 - # New transformers need to check manually. if SUPPORTS_LLAMA32: # Check if folder exists locally @@ -240,17 +267,8 @@ def from_pretrained( pass pass - # Error out if both LoRA and normal model config exists. - if both_exist: - raise RuntimeError( - "Unsloth: Your repo has a LoRA adapter and a base model.\n"\ - "You have 2 files `config.json` and `adapter_config.json`.\n"\ - "We must only allow one config file.\n"\ - "Please separate the LoRA and base models to 2 repos." - ) - - elif not is_model and not is_peft: - error = autoconfig_error or peft_error + if not is_model and not is_peft: + error = autoconfig_error if autoconfig_error is not None else peft_error # Old transformers version if "rope_scaling" in error.lower() and not SUPPORTS_LLAMA31: raise ImportError( @@ -498,13 +516,6 @@ def from_pretrained( from transformers import AutoModelForVision2Seq pass -# Must be alphabetically sorted for each entry -DISABLE_COMPILE_MODEL_NAMES = [ - "aya_vision", - "modernbert", - "granite,llava_next", # Granite-vision 3 -] - class FastModel(FastBaseModel): @staticmethod @@ -626,8 +637,20 @@ def from_pretrained( peft_error = str(error) is_peft = False pass - model_types = get_transformers_model_type(peft_config or model_config) - model_types_all = ",".join(model_types) + # Old transformers versions check + both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 + # Error out if both LoRA and normal model config exists. 
+ if both_exist: + raise RuntimeError( + "Unsloth: Your repo has a LoRA adapter and a base model.\n"\ + "You have 2 files `config.json` and `adapter_config.json`.\n"\ + "We must only allow one config file.\n"\ + "Please separate the LoRA and base models to 2 repos." + ) + model_types = get_transformers_model_type( + peft_config if peft_config is not None else model_config + ) + model_types_all = ",".join(model_types) + "," # Check versions lowered_model_name = model_name.lower() @@ -643,20 +666,22 @@ def from_pretrained( raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST) # Gemma 3 elif "gemma3" in model_types_all: - if "gemma3n" in model_types_all: - if transformers_version < Version("4.53.0"): - raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) - os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" - os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ - "float16;torch.float16;torch.float16;"\ - "if name.endswith('norm'): "\ - "module._pre_set_compute_dtype = torch.float32\n"\ - ";"\ - "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" - else: - if transformers_version < Version("4.50.0.dev0"): - raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) - + if transformers_version < Version("4.50.0.dev0"): + raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." 
+ NIGHTLY) + # Set norms to float32 since anyways they get upcasted to float32 + # common in both gemma-3 and gemma-3n + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" + # Gemma 3N + elif "gemma3n" in model_types_all: + if transformers_version < Version("4.53.0"): + raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) + os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" + os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ + "float16;torch.float16;torch.float16;"\ + "if name.endswith('norm'): "\ + "module._pre_set_compute_dtype = torch.float32\n"\ + ";"\ + "from unsloth_zoo.temporary_patches.gemma3n import patch_Gemma3nConv_Embed_forwards; patch_Gemma3nConv_Embed_forwards()" # Set norms to float32 since anyways they get upcasted to float32 # common in both gemma-3 and gemma-3n os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" @@ -732,9 +757,6 @@ def from_pretrained( os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" pass - # Old transformers versions check - both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 - # New transformers need to check manually. if SUPPORTS_LLAMA32: # Check if folder exists locally @@ -751,17 +773,8 @@ def from_pretrained( pass pass - # Error out if both LoRA and normal model config exists. - if both_exist: - raise RuntimeError( - "Unsloth: Your repo has a LoRA adapter and a base model.\n"\ - "You have 2 files `config.json` and `adapter_config.json`.\n"\ - "We must only allow one config file.\n"\ - "Please separate the LoRA and base models to 2 repos." 
- ) - - elif not is_model and not is_peft: - error = autoconfig_error or peft_error + if not is_model and not is_peft: + error = autoconfig_error if autoconfig_error is not None else peft_error # Old transformers version if "rope_scaling" in error.lower() and not SUPPORTS_LLAMA31: raise ImportError( @@ -811,7 +824,7 @@ def from_pretrained( for disable_name in FORCE_FLOAT32: # add comma to model_types_all matching in case of exact match for end if (disable_name.lower() == model_type_arch.lower().replace("-", "").replace("_", "") or \ - disable_name.lower() in f'{model_types_all},') and \ + disable_name.lower() in model_types_all) and \ ((dtype == torch.float16) or not SUPPORTS_BFLOAT16): os.environ["UNSLOTH_FORCE_FLOAT32"] = "1" dtype = torch.bfloat16 # Change to bfloat16 loading @@ -855,8 +868,9 @@ def from_pretrained( unsloth_force_compile = unsloth_force_compile, ) pass - # Fix SDPA - if "gemma3n" in model_types_all: + # Fix SDPA issues + for model_type in DISABLE_SDPA_MODEL_NAMES: + if model_type in model_types_all: supports_sdpa = False pass From 755e6e2026cc120f876d489a6daaf512255f843d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 02:12:12 -0700 Subject: [PATCH 156/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ce90874f94..a400cb621a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -871,7 +871,7 @@ def from_pretrained( # Fix SDPA issues for model_type in DISABLE_SDPA_MODEL_NAMES: if model_type in model_types_all: - supports_sdpa = False + supports_sdpa = False pass # Check if this is local model since the tokenizer gets overwritten From d01b8af06d3b56ee1cc44934b620e0861c7d666d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 16:54:25 -0700 Subject: [PATCH 157/272] Update vision.py --- unsloth/models/vision.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/unsloth/models/vision.py b/unsloth/models/vision.py index 3f5ae816ea..4e9b88a205 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -242,6 +242,8 @@ def unsloth_base_fast_generate( kwargs["compile_config"] = _compile_config pass + kwargs["cache_implementation"] = "static" + with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From d048d3aedaaa5d5d62bdfd679f8c42ffd295ce27 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 18:01:46 -0700 Subject: [PATCH 158/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 4e9b88a205..934f2bbc5e 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -422,7 +422,7 @@ def from_pretrained( # Stop SDPA for some archs like Pixtral / Mistral3 if not ("attn_implementation" in kwargs): kwargs["attn_implementation"] = "sdpa" - if not supports_sdpa: + if not supports_sdpa and (os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "0") == "0"): print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to fast eager.") del kwargs["attn_implementation"] pass From 81ba78e9a11500de05596dd0b28979abe3b14050 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 18:29:35 -0700 Subject: [PATCH 159/272] Update loader.py --- unsloth/models/loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a400cb621a..baeedd339d 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -868,6 +868,7 @@ def from_pretrained( unsloth_force_compile = unsloth_force_compile, ) pass + print("supports_sdpa", supports_sdpa) # Fix SDPA issues for model_type in DISABLE_SDPA_MODEL_NAMES: if model_type in model_types_all: From 0bb74fe32ae1f0790142f88eadefbaba36a94742 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 18:31:36 -0700 Subject: [PATCH 160/272] 
Message --- unsloth/models/loader.py | 1 - unsloth/models/vision.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index baeedd339d..a400cb621a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -868,7 +868,6 @@ def from_pretrained( unsloth_force_compile = unsloth_force_compile, ) pass - print("supports_sdpa", supports_sdpa) # Fix SDPA issues for model_type in DISABLE_SDPA_MODEL_NAMES: if model_type in model_types_all: diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 934f2bbc5e..9e954894a4 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -422,8 +422,9 @@ def from_pretrained( # Stop SDPA for some archs like Pixtral / Mistral3 if not ("attn_implementation" in kwargs): kwargs["attn_implementation"] = "sdpa" - if not supports_sdpa and (os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "0") == "0"): - print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to fast eager.") + if not supports_sdpa: + if os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "0") == "0": + print(f"Unsloth: {model_type_arch.title()} does not support SDPA - switching to fast eager.") del kwargs["attn_implementation"] pass From 14fdb224e61a24e22660f54033ace71fc62c0163 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 19:12:58 -0700 Subject: [PATCH 161/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 9e954894a4..d716cdf9a1 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -242,7 +242,7 @@ def unsloth_base_fast_generate( kwargs["compile_config"] = _compile_config pass - kwargs["cache_implementation"] = "static" + print(kwargs["cache_implementation"], args, kwargs) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From 
ce4f2b6cb2f8b1a060cdebddfeee3bffaa1291f7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 19:18:05 -0700 Subject: [PATCH 162/272] Update loader.py --- unsloth/models/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a400cb621a..c25fd86966 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -713,7 +713,6 @@ def from_pretrained( ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" elif "gpt_oss" in model_types_all: - os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" if not load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB # Set norms to float32 since anyways they get upcasted to float32 From e333b03f0645a5c1c3ee33e0319f80f0259ccddc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 19:21:20 -0700 Subject: [PATCH 163/272] Update vision.py --- unsloth/models/vision.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index d716cdf9a1..f257bf4ec3 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -242,8 +242,6 @@ def unsloth_base_fast_generate( kwargs["compile_config"] = _compile_config pass - print(kwargs["cache_implementation"], args, kwargs) - with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From 456d225e1a9f1297bc0067840cb8a8b975712e72 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 19:23:03 -0700 Subject: [PATCH 164/272] cache_implementation --- unsloth/models/loader.py | 1 + unsloth/models/vision.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index c25fd86966..a400cb621a 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -713,6 +713,7 @@ def from_pretrained( ";"\ "os.environ['TRITON_F32_DEFAULT'] = 'ieee'" elif "gpt_oss" in model_types_all: + os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" if not 
load_in_4bit: # Only upcast MoE biases for MXFP4, not BnB # Set norms to float32 since anyways they get upcasted to float32 diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index f257bf4ec3..e3b87c3dde 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -217,8 +217,11 @@ def unsloth_base_fast_generate( if getattr(self, "_supports_static_cache", getattr(self, "_can_compile_fullgraph", True)): if os.environ.get("UNSLOTH_DISABLE_STATIC_GENERATION", "0") == "0": cache_implementation = "static" - else: + elif Version(transformers_version) < Version("4.56.0.dev0"): cache_implementation = None + else: + # Should work in latest transformers! + cache_implementation = "static" else: cache_implementation = None if cache_implementation is not None: @@ -241,7 +244,7 @@ def unsloth_base_fast_generate( if cache_implementation is not None: kwargs["compile_config"] = _compile_config pass - + print(cache_implementation) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From 1cd7b85b07f03071936d3e336d17540cc445b987 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 19:37:23 -0700 Subject: [PATCH 165/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e3b87c3dde..f0ae0e39a8 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -233,7 +233,7 @@ def unsloth_base_fast_generate( if Version(transformers_version) < Version("4.56.0.dev0"): cache_implementation = "hybrid" else: - cache_implementation = "static" + cache_implementation = None if "generation_config" in kwargs: kwargs["generation_config"].cache_implementation = cache_implementation From 2b0d2195a3c70b375c85a3106aba01215ee5e082 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 21:35:13 -0700 Subject: [PATCH 166/272] Update loader.py --- unsloth/models/loader.py | 16 ++++++++-------- 1 file 
changed, 8 insertions(+), 8 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a400cb621a..a445dd72f1 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -664,14 +664,7 @@ def from_pretrained( # Qwen 2.5 elif "qwen2_5" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST) - # Gemma 3 - elif "gemma3" in model_types_all: - if transformers_version < Version("4.50.0.dev0"): - raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) - # Set norms to float32 since anyways they get upcasted to float32 - # common in both gemma-3 and gemma-3n - os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" - # Gemma 3N + # Gemma 3N must be before Gemma 3 elif "gemma3n" in model_types_all: if transformers_version < Version("4.53.0"): raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) @@ -685,6 +678,13 @@ def from_pretrained( # Set norms to float32 since anyways they get upcasted to float32 # common in both gemma-3 and gemma-3n os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" + # Gemma 3 + elif "gemma3" in model_types_all: + if transformers_version < Version("4.50.0.dev0"): + raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY) + # Set norms to float32 since anyways they get upcasted to float32 + # common in both gemma-3 and gemma-3n + os.environ["UNSLOTH_HIGH_PRECISION_LAYERNORM"] = "1" # Cohere elif "cohere2" in model_types_all and transformers_version < Version("4.50.0.dev0"): raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0."
+ NIGHTLY) From d1c92839451f0d11673d5814460089f99ef1eea9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 21:39:37 -0700 Subject: [PATCH 167/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index f0ae0e39a8..e3b87c3dde 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -233,7 +233,7 @@ def unsloth_base_fast_generate( if Version(transformers_version) < Version("4.56.0.dev0"): cache_implementation = "hybrid" else: - cache_implementation = None + cache_implementation = "static" if "generation_config" in kwargs: kwargs["generation_config"].cache_implementation = cache_implementation From a0df6ab7090090f907cdb763468f6ec0cc373c75 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 22:19:47 -0700 Subject: [PATCH 168/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e3b87c3dde..d5076f7762 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -244,6 +244,7 @@ def unsloth_base_fast_generate( if cache_implementation is not None: kwargs["compile_config"] = _compile_config pass + kwargs["cache_implementation"] = None print(cache_implementation) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From 450b2da52f45722b62c4084699f5102869577193 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 20 Sep 2025 22:23:10 -0700 Subject: [PATCH 169/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index d5076f7762..13b078378d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -244,7 +244,7 @@ def unsloth_base_fast_generate( if cache_implementation is not None: kwargs["compile_config"] = _compile_config pass - 
kwargs["cache_implementation"] = None + # kwargs["cache_implementation"] = None print(cache_implementation) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From b1116d59a22dc2a09e19d1ced2e48254de2dc742 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 00:52:40 -0700 Subject: [PATCH 170/272] Update loader.py --- unsloth/models/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a445dd72f1..1a9c145368 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -690,7 +690,7 @@ def from_pretrained( raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0." + NIGHTLY) # Sesame elif "csm" in model_types_all: - os.environ["UNSLOTH_COMPILE_DISABLE"] = "1" # Inference is too slow + os.environ["UNSLOTH_COMPILE_DISABLE"] = "partial" # Inference is too slow os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # Sesame fails os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \ "all;torch.float32;torch.float16;"\ @@ -745,7 +745,7 @@ def from_pretrained( else: for check_model_name in DISABLE_COMPILE_MODEL_NAMES: if check_model_name in lowered_model_name: - os.environ["UNSLOTH_COMPILE_DISABLE"] = "1" + os.environ["UNSLOTH_COMPILE_DISABLE"] = "partial" os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" if transformers_version < Version("4.50.0.dev0"): raise RuntimeError(f"Unsloth: {check_model_name} only works on transformers >= 4.50.0." 
+ NIGHTLY) From 7210cb1d5b53601efde268751f788825e3302e74 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 03:30:11 -0700 Subject: [PATCH 171/272] Update vision.py --- unsloth/models/vision.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 13b078378d..2ea4d1f71f 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -244,8 +244,6 @@ def unsloth_base_fast_generate( if cache_implementation is not None: kwargs["compile_config"] = _compile_config pass - # kwargs["cache_implementation"] = None - print(cache_implementation) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From f148170a3cc007d2cf3f28c76cb1c9797c828bcd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 03:54:20 -0700 Subject: [PATCH 172/272] Save max_seq_length --- unsloth/models/llama.py | 6 ++++++ unsloth/models/vision.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6326f519f1..fee83441fe 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2170,6 +2170,9 @@ def from_pretrained( m = m.model pass m.max_seq_length = max_seq_length + # Save to modules as well + for module in model.modules(): + module.max_seq_length = max_seq_length # We check the tokenizer first for errors if fix_tokenizer: @@ -2892,6 +2895,9 @@ def patch_peft_model( internal_model = internal_model.model pass internal_model.max_seq_length = max_seq_length + # Save to modules as well + for module in model.modules(): + module.max_seq_length = max_seq_length # Patch tokenizer to pad to the right internal_model = model diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 2ea4d1f71f..6d8250318d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -645,6 +645,9 @@ def from_pretrained( m = m.model pass m.max_seq_length = max_seq_length + # Save to modules as well + for module in 
model.modules(): + module.max_seq_length = max_seq_length m._saved_temp_tokenizer = tokenizer # Also set is_loaded_in_8bit to disable incorrect DDP m.is_loaded_in_8bit = True if not full_finetuning else False @@ -780,6 +783,9 @@ def get_peft_model( trust_remote_code = getattr(model, "_unsloth_trust_remote_code", False) model = FastBaseModel.post_patch_model(model, use_gradient_checkpointing, trust_remote_code = trust_remote_code) model.max_seq_length = max_seq_length + # Save to modules as well + for module in model.modules(): + module.max_seq_length = max_seq_length # Clear deleted GPU items for _ in range(3): gc.collect() From 7fa66da748deb9144122107f083a79a4c8f97c18 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 03:56:12 -0700 Subject: [PATCH 173/272] Update _utils.py --- unsloth/models/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 60abcea702..8275283c6a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -137,6 +137,7 @@ # ============================================= # Disable some warnings which can get annoying warnings.filterwarnings(action = "ignore", category = UserWarning, module = "torch") +warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "torch") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "huggingface_hub") warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "huggingface_hub") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "trl") From 0b49db1a0b2d13789da43a9f3326063093cdec54 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 18:01:21 -0700 Subject: [PATCH 174/272] Update rl.py --- unsloth/models/rl.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 3d5f6d084b..889cbaccaf 100644 --- a/unsloth/models/rl.py +++ 
b/unsloth/models/rl.py @@ -116,6 +116,22 @@ def generate_with_clone(*args, **kwargs): from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling from transformers.training_args import ParallelMode +# Wrap trainer with padding to right and enable training mode +import functools +def prepare_for_training_mode(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + # Enable training mode + if hasattr(self, model) and hasattr(self.model, "for_training"): + self.model.for_training() + output = f(self, *args, **kwargs) + # Return inference mode + if hasattr(self, model) and hasattr(self.model, "for_inference"): + self.model.for_inference() + return output + return wrapper +pass + torch_compile_options = {{ "epilogue_fusion" : True, "max_autotune" : False, @@ -174,7 +190,11 @@ def __init__({RLTrainer_arguments}, if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: if getattr(args, "_n_gpu", 1) != 1: args._n_gpu = 1 + if "model" in locals() and hasattr(model, "for_training"): + model.for_training() super().__init__({RLTrainer_call_args}{RLTrainer_kwargs}) + if "model" in locals() and hasattr(model, "for_inference"): + model.for_inference() {RLTrainer_post} pass ''' @@ -460,7 +480,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): # Add accelerator scaler to model if "model" in call_args: - neftune_check = \ + accelerator_check = \ "if hasattr(self, 'accelerator'):\n"\ " scaler = self.accelerator.scaler\n"\ " current_model = model\n"\ @@ -469,7 +489,16 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): " current_model = current_model.model\n"\ " current_model.accelerator_scaler = scaler\n"\ "pass\n" - RLTrainer_post += neftune_check + RLTrainer_post += accelerator_check + pass + + # Add enabling and disabling training modes + if "model" in call_args: + training_check = \ + "if hasattr(self, 'train'):\n"\ + " self.train = 
prepare_for_training_mode(self.train)\n"\ + "pass\n" + RLTrainer_post += training_check pass # Edit optional metrics From f1c47f860c7f9a7dc34bd5eba27c61daf67af38b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 18:10:27 -0700 Subject: [PATCH 175/272] Update vision.py --- unsloth/models/vision.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 6d8250318d..36e2cdd459 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -247,7 +247,7 @@ def unsloth_base_fast_generate( with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) - FastBaseModel.for_training(self) + # FastBaseModel.for_training(self) return output pass @@ -576,7 +576,7 @@ def from_pretrained( if (whisper_language and whisper_task) or auto_model.__name__.endswith("ForConditionalGeneration"): tokenizer = auto_processor.from_pretrained( tokenizer_name, - padding_side = "right", + padding_side = "left", token = token, language = whisper_language, task = whisper_task, @@ -585,19 +585,19 @@ def from_pretrained( try: tokenizer = auto_processor.from_pretrained( tokenizer_name, - padding_side = "right", + padding_side = "left", token = token, ) except: tokenizer = get_auto_processor( tokenizer_name, - padding_side = "right", + padding_side = "left", token = token, ) if hasattr(tokenizer, "tokenizer"): __tokenizer = tokenizer.tokenizer # Add padding side as well - __tokenizer.padding_side = "right" + __tokenizer.padding_side = "left" # Check bos, eos, pad tokens if hasattr(__tokenizer, "bos_token"): tokenizer.bos_token = __tokenizer.bos_token @@ -800,6 +800,11 @@ def get_peft_model( # Add for_inference and for_training model.for_training = functools.partial(FastBaseModel.for_training, model) model.for_inference = functools.partial(FastBaseModel.for_inference, model) + m = model + while hasattr(m, "model"): + m.for_training = 
functools.partial(FastBaseModel.for_training, m) + m.for_inference = functools.partial(FastBaseModel.for_inference, m) + m = m.model return model pass @@ -835,12 +840,12 @@ def post_patch_model( pass patch_saving_functions(model, vision = True) - # Patch tokenizer to pad to the right + # Patch tokenizer to pad to the left m = model while hasattr(m, "model"): if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): - m._saved_temp_tokenizer.tokenizer.padding_side = "right" + m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP m.is_loaded_in_8bit = True if not full_finetuning else False @@ -848,7 +853,7 @@ def post_patch_model( pass if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): - m._saved_temp_tokenizer.tokenizer.padding_side = "right" + m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP m.is_loaded_in_8bit = True if not full_finetuning else False @@ -864,6 +869,11 @@ def post_patch_model( # Add for_inference and for_training model.for_training = functools.partial(FastBaseModel.for_training, model) model.for_inference = functools.partial(FastBaseModel.for_inference, model) + m = model + while hasattr(m, "model"): + m.for_training = functools.partial(FastBaseModel.for_training, m) + m.for_inference = functools.partial(FastBaseModel.for_inference, m) + m = m.model return model pass From 27f62038eda9b01c3022d2dcd25ac268bb1a030a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 18:15:39 -0700 Subject: [PATCH 176/272] Update llama.py --- unsloth/models/llama.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index fee83441fe..8708fb5218 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2231,6 +2231,11 @@ def from_pretrained( # Add for_inference and for_training 
model.for_training = functools.partial(FastLlamaModel.for_training, model) model.for_inference = functools.partial(FastLlamaModel.for_inference, model) + m = model + while hasattr(m, "model"): + m.for_training = functools.partial(FastBaseModel.for_training, m) + m.for_inference = functools.partial(FastBaseModel.for_inference, m) + m = m.model # Patch generate is_classification = "Classification" in str(type(model)) @@ -2707,6 +2712,11 @@ def get_peft_model( # Add for_inference and for_training model.for_training = functools.partial(FastLlamaModel.for_training, model) model.for_inference = functools.partial(FastLlamaModel.for_inference, model) + m = model + while hasattr(m, "model"): + m.for_training = functools.partial(FastBaseModel.for_training, m) + m.for_inference = functools.partial(FastBaseModel.for_inference, m) + m = m.model return model pass @@ -2922,6 +2932,11 @@ def patch_peft_model( # Add for_inference and for_training model.for_training = functools.partial(FastLlamaModel.for_training, model) model.for_inference = functools.partial(FastLlamaModel.for_inference, model) + m = model + while hasattr(m, "model"): + m.for_training = functools.partial(FastBaseModel.for_training, m) + m.for_inference = functools.partial(FastBaseModel.for_inference, m) + m = m.model return model pass From f06179fa85d7810a5c82de1212ca1a3706a71f9e Mon Sep 17 00:00:00 2001 From: Datta Nimmaturi Date: Mon, 22 Sep 2025 10:50:03 +0530 Subject: [PATCH 177/272] Mistral3 vllm (#3349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [WIP] use vLLM for vision language models * Update README.md Editing icon sizes * Update README.md Updating icon sizes * Update README.md (#2885) * MoE kernels AGPLv3 * versioning * Many bug fixes (#2908) * add deepseek v3 * add deepseek r1 base * add deepseek r1 zero * add deepseek distill llama * add deepseek distill models * remove redundant code when constructing model names * add mistral small to registry * rename 
model registration methods * rename deepseek registration methods * refactor naming for mistral and phi * add global register models * refactor model registration tests for new registry apis * add model search method * remove deprecated registration api * add quant type test * add registry readme * make llama registration more specific * clear registry when executing individual model registration file * more registry readme updates * Update _auto_install.py * Llama4 * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Synthetic data * Update mapper.py * Xet and Synthetic * Update synthetic.py * Update loader.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update pyproject.toml * Delete .gitignore * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update _utils.py * Update pyproject.toml * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update chat_templates.py * Seasame force 
float16 / float32 * Fix Seasame * Update loader.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * is_multimodal * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update vision.py * Update vision.py * Update vision.py * UNSLOTH_DISABLE_STATIC_GENERATION * Update vision.py * Auto vision detection * Sesame * Whisper * Update loader.py * Update loader.py * Update loader.py * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update _utils.py * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * logging * Update pyproject.toml * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * logits / temperature * Update rl_replacements.py * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Debugging only * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Generic efficient GRPO * Update rl_replacements.py * Update rl_replacements.py * Remove debugging * Update rl_replacements.py * Update rl_replacements.py * Update vision.py * Update llama.py * Update rl_replacements.py * versioning * Update _utils.py * Update vision.py * Update mapper.py * Update loader.py * Update mapper.py * Update vision.py * Update loader.py * Update vision.py * Update loader.py * Update _utils.py * Update vision.py * gradient checkpointing * Gemma 3N fixes * Update loader.py * Versioning * Gemma 3N fixes * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Fix setup.py * setup.py * Prints * Update setup.py * Update setup.py * Update 
setup.py * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update vision.py * Update vision.py * Update pyproject.toml * Update vision.py * Update _utils.py * Update __init__.py * Update __init__.py --------- Co-authored-by: jeromeku Co-authored-by: Michael Han <107991372+shimmyshimmer@users.noreply.github.com> * silienty skip falcon h1 import is transformers_version < 4.53.0 (#2912) * Dynamically adjust get_per_token_logps function and patch as well (#2911) * add intel gpu with vllm support (#2903) * [bugs] fix for casual mask (#2868) * fix for casual mask * use un_casual in sdpa * add missing mask * fix for type * Explicitly check if xformers exists for attention (#2889) * Update __init__.py * Update llama.py * if mlp doesn't exist in layer module check for feed_forward name for falcon h1 (#2913) * Move inputs to right devices. (#2919) * Move tensors to right devices * fix multi gpu for non mistral models * multi GPU RoPE for gemma2 * Finish up multi GPU inference * Make multiGPU rope a list * Remove unnecessary transfer to CPU * Remove unnecessary move to CPU * Donot move inputs to device yet will be handled separately in another PR * Move inputs to appropriate decoder device * Make device count global variable * Cleanup RoPE device code * Fixup num_gpu to device count * Cleanup device counts * Use device index for RoPE get_cache * Donot typecast * Use tuple instead of list for tensors. 
Use device index directly * fixup move to device logic * WIP VLM vLLM * Make vLLM patch a function * Add save and load lora functions * Make fast_inference setup depend on the flag * Improve fast inference patching mechanism * Make vision setting depend on checks in fastbasemodel * Check LoRA and vLLM intercompatibility for vision models * Comment pointing to vLLM LoRA check * Improve lora validation on vLLM * Error out on no vLLM and increase max lora rank * Bug fixes (#3017) * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update pyproject.toml * Delete .gitignore * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update _utils.py * Update pyproject.toml * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update chat_templates.py * Seasame force float16 / float32 * Fix Seasame * Update loader.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * is_multimodal * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update vision.py * Update vision.py * Update vision.py * UNSLOTH_DISABLE_STATIC_GENERATION * Update vision.py * Auto vision detection * Sesame * Whisper * Update loader.py * Update loader.py * 
Update loader.py * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update _utils.py * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * logging * Update pyproject.toml * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * logits / temperature * Update rl_replacements.py * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Debugging only * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Generic efficient GRPO * Update rl_replacements.py * Update rl_replacements.py * Remove debugging * Update rl_replacements.py * Update rl_replacements.py * Update vision.py * Update llama.py * Update rl_replacements.py * versioning * Update _utils.py * Update vision.py * Update mapper.py * Update loader.py * Update mapper.py * Update vision.py * Update loader.py * Update vision.py * Update loader.py * Update _utils.py * Update vision.py * gradient checkpointing * Gemma 3N fixes * Update loader.py * Versioning * Gemma 3N fixes * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Fix setup.py * setup.py * Prints * Update setup.py * Update setup.py * Update setup.py * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update vision.py * Update vision.py * Update pyproject.toml * Update vision.py * Update _utils.py * Update __init__.py * Update __init__.py * Small fixes * Update vision.py * Update vision.py * versioning * Update __init__.py * Update llama.py * Update rl.py * 
Update rl.py * Update _utils.py * Update vision.py * Update vision.py * compiler stance * Update _utils.py * Update pyproject.toml * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Revert "Revert "Add Qwen2.5-VL-32B-Instruct mapping to fix quantized model me…" (#2990) This reverts commit 204fc46e1904ac3de01f06099f07b88b46be38bf. * skip_guard_eval_unsafe fix * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update llama.py * Update llama.py * Fix `quantization_method` * versioning * fix for casual mask (#3011) * [intel] add for intel path for llama.py (#3012) * fix for intel path * remove unuse code * Update unsloth/models/llama.py --------- Co-authored-by: Daniel Han * Update llama.py * Fix Gemma 2 (#3024) * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update pyproject.toml * Delete .gitignore * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update _utils.py * Update pyproject.toml * Update synthetic.py * Update synthetic.py * Update synthetic.py 
* Update synthetic.py * Update chat_templates.py * Seasame force float16 / float32 * Fix Seasame * Update loader.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * is_multimodal * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update vision.py * Update vision.py * Update vision.py * UNSLOTH_DISABLE_STATIC_GENERATION * Update vision.py * Auto vision detection * Sesame * Whisper * Update loader.py * Update loader.py * Update loader.py * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update _utils.py * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * logging * Update pyproject.toml * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * logits / temperature * Update rl_replacements.py * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Debugging only * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Generic efficient GRPO * Update rl_replacements.py * Update rl_replacements.py * Remove debugging * Update rl_replacements.py * Update rl_replacements.py * Update vision.py * Update llama.py * Update rl_replacements.py * versioning * Update _utils.py * Update vision.py * Update mapper.py * Update loader.py * Update mapper.py * Update vision.py * Update loader.py * Update vision.py * Update loader.py * Update _utils.py * Update vision.py * gradient checkpointing * Gemma 3N fixes * Update loader.py * Versioning * Gemma 3N fixes * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Fix setup.py * 
setup.py * Prints * Update setup.py * Update setup.py * Update setup.py * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update vision.py * Update vision.py * Update pyproject.toml * Update vision.py * Update _utils.py * Update __init__.py * Update __init__.py * Small fixes * Update vision.py * Update vision.py * versioning * Update __init__.py * Update llama.py * Update rl.py * Update rl.py * Update _utils.py * Update vision.py * Update vision.py * compiler stance * Update _utils.py * Update pyproject.toml * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Revert "Revert "Add Qwen2.5-VL-32B-Instruct mapping to fix quantized model me…" (#2990) This reverts commit 204fc46e1904ac3de01f06099f07b88b46be38bf. 
* skip_guard_eval_unsafe fix * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update llama.py * Update llama.py * Fix `quantization_method` * versioning * Update _utils.py * Update _utils.py * Update _utils.py * falcon force float32 on sm<75 machines (#3026) * Fix torch compile issues (#3028) * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update pyproject.toml * Delete .gitignore * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update _utils.py * Update pyproject.toml * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update chat_templates.py * Seasame force float16 / float32 * Fix Seasame * Update loader.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * is_multimodal * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update vision.py * Update vision.py * Update vision.py * UNSLOTH_DISABLE_STATIC_GENERATION * Update vision.py * Auto vision detection * Sesame * Whisper * Update loader.py * Update loader.py * Update loader.py * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update _utils.py * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * logging * Update 
pyproject.toml * Update rl.py * versioning * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * logits / temperature * Update rl_replacements.py * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Debugging only * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Generic efficient GRPO * Update rl_replacements.py * Update rl_replacements.py * Remove debugging * Update rl_replacements.py * Update rl_replacements.py * Update vision.py * Update llama.py * Update rl_replacements.py * versioning * Update _utils.py * Update vision.py * Update mapper.py * Update loader.py * Update mapper.py * Update vision.py * Update loader.py * Update vision.py * Update loader.py * Update _utils.py * Update vision.py * gradient checkpointing * Gemma 3N fixes * Update loader.py * Versioning * Gemma 3N fixes * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Fix setup.py * setup.py * Prints * Update setup.py * Update setup.py * Update setup.py * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update vision.py * Update vision.py * Update pyproject.toml * Update vision.py * Update _utils.py * Update __init__.py * Update __init__.py * Small fixes * Update vision.py * Update vision.py * versioning * Update __init__.py * Update llama.py * Update rl.py * Update rl.py * Update _utils.py * Update vision.py * Update vision.py * compiler stance * Update _utils.py * Update pyproject.toml * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * 
Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Revert "Revert "Add Qwen2.5-VL-32B-Instruct mapping to fix quantized model me…" (#2990) This reverts commit 204fc46e1904ac3de01f06099f07b88b46be38bf. * skip_guard_eval_unsafe fix * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update synthetic.py * Update llama.py * Update llama.py * Fix `quantization_method` * versioning * Update _utils.py * Update _utils.py * Update _utils.py * check stride * Cleanup * Update rope_embedding.py * Update gemma2.py * Fix `set_stance` * Update pyproject.toml * Update _utils.py * Fixup patch vllm * Disable mllama * Use variables to decide VLM support * Better attn_impl handling * Patch TF protobuf incompatability * Torch 2.8 (#3186) * Fix mamba * Update loader.py * Update vision.py * Update loader.py * Filter vLLM standby logs (#3131) * filter vLLM standby logs * safeguard standby logger patch * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py --------- Co-authored-by: Daniel Han * Update loader.py * Add scaler * Update llama.py * Update _utils.py * Versioning * GPT OSS fix * GPT OSS fix * Update loader.py * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Update vision.py * Update llama.py * Update llama.py * Update llama.py * Versioning * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Upcast norms * Update loader.py * Update vision.py * Upcast layernorms * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update save.py * Update rl.py * Update pyproject.toml * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update _utils.py * Update __init__.py * Torch 2.8 * Update rl_replacements.py --------- Co-authored-by: Datta Nimmaturi * Update _auto_install.py * Update pyproject.toml * 
Update rl.py * Protobuf issue * Update pyproject.toml * Fix extras transformers typo in pyproject.toml * Update _utils.py * Bug fixes (#3195) * Fix mamba * Update loader.py * Update vision.py * Update loader.py * Filter vLLM standby logs (#3131) * filter vLLM standby logs * safeguard standby logger patch * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py * Update unsloth/models/_utils.py --------- Co-authored-by: Daniel Han * Update loader.py * Add scaler * Update llama.py * Update _utils.py * Versioning * GPT OSS fix * GPT OSS fix * Update loader.py * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Update vision.py * Update llama.py * Update llama.py * Update llama.py * Versioning * Update mapper.py * Update vision.py * Update vision.py * Update vision.py * Upcast norms * Update loader.py * Update vision.py * Upcast layernorms * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update save.py * Update rl.py * Update pyproject.toml * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update _utils.py * Update __init__.py * Torch 2.8 * Update rl_replacements.py * Update loader.py * UNSLOTH_ENABLE_CCE * Fix * Update loader.py * Update loader.py * Update __init__.py * Update __init__.py * Update __init__.py * Update __init__.py * Import fixes * Update loader.py * Fix aimv2 issue * Update loader.py * Update import_fixes.py * Update import_fixes.py * Update loader.py * Update loader.py * Update loader.py * Upgrade * Update loader.py * Update loader.py * Update loader.py * Update loader.py --------- Co-authored-by: Datta Nimmaturi * adallow float32 dtype in FastLanguageModel (#3204) * Update loader.py * Update vision.py * Suppress message and use unsloth sampling params * Use trl sampling params for now * Improve error message * fixup quantized fast inference model name * Add mistral 3 support --------- Co-authored-by: Michael Han 
<107991372+shimmyshimmer@users.noreply.github.com> Co-authored-by: Daniel Han Co-authored-by: jeromeku Co-authored-by: DoubleMathew Co-authored-by: Lei Zhenyuan Co-authored-by: parth2510 --- unsloth/models/vision.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 36e2cdd459..b0777c99a8 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -83,6 +83,7 @@ VLLM_SUPPORTED_VLM = [ "qwen2_5_vl", "gemma3", + "mistral3" ] VLLM_NON_LORA_VLM = [ "mllama" @@ -90,6 +91,7 @@ from transformers import GenerationConfig, CompileConfig, HybridCache, AutoConfig, PretrainedConfig HAS_TORCH_DTYPE = "torch_dtype" in PretrainedConfig.__doc__ + from transformers import GenerationConfig, CompileConfig, HybridCache _compile_config = CompileConfig( From 67a544de0598a6c7a5442e4578c1348e70fc0e65 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 21 Sep 2025 23:10:47 -0700 Subject: [PATCH 178/272] Set padding to 0 --- unsloth/models/llama.py | 7 +++++++ unsloth/models/vision.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 8708fb5218..1b22542514 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -2244,6 +2244,13 @@ def from_pretrained( unsloth_fast_generate.__doc__ = model._old_generate.__doc__ model.generate = types.MethodType(unsloth_fast_generate, model) pass + # Set weight[padding_idx] = 0 + with torch.no_grad(): + for name, module in model.named_modules(): + if type(module) is torch.nn.Embedding: + if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: + if module.padding_idx < module.weight.shape[0]: + module.weight[module.padding_idx] = 0 return model, tokenizer pass diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index b0777c99a8..10f757c21b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -876,6 +876,13 @@ def post_patch_model( 
m.for_training = functools.partial(FastBaseModel.for_training, m) m.for_inference = functools.partial(FastBaseModel.for_inference, m) m = m.model + # Set weight[padding_idx] = 0 + with torch.no_grad(): + for name, module in model.named_modules(): + if type(module) is torch.nn.Embedding: + if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: + if module.padding_idx < module.weight.shape[0]: + module.weight[module.padding_idx] = 0 return model pass From 72383278d174f42398ca7f8e3b8e3e9b18401da5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 02:24:10 -0700 Subject: [PATCH 179/272] Fix patch --- unsloth/models/loader.py | 2 +- unsloth/models/rl.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 1a9c145368..875c6f73fc 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -664,7 +664,7 @@ def from_pretrained( # Qwen 2.5 elif "qwen2_5" in model_types_all and transformers_version < Version("4.49.0"): raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." 
+ LATEST) - # Gemma 3N must be beefore Gemma 3 + # Gemma 3N must be before Gemma 3 elif "gemma3n" in model_types_all: if transformers_version < Version("4.53.0"): raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 889cbaccaf..b032f228c8 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -118,6 +118,7 @@ def generate_with_clone(*args, **kwargs): # Wrap trainer with padding to right and enable training mode import functools +from types import MethodType def prepare_for_training_mode(f): @functools.wraps(f) def wrapper(self, *args, **kwargs): @@ -496,7 +497,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): if "model" in call_args: training_check = \ "if hasattr(self, 'train'):\n"\ - " self.train = prepare_for_training_mode(self.train)\n"\ + " self.train = MethodType(prepare_for_training_mode(self.__class__.train), self)\n"\ "pass\n" RLTrainer_post += training_check pass From 8a1e6fb9d6a5147e51e161c8a9f92788457f250d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 02:29:46 -0700 Subject: [PATCH 180/272] fixup patch (#3359) Co-authored-by: Datta Nimmaturi --- unsloth/models/rl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index b032f228c8..65434bb095 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -123,11 +123,11 @@ def prepare_for_training_mode(f): @functools.wraps(f) def wrapper(self, *args, **kwargs): # Enable training mode - if hasattr(self, model) and hasattr(self.model, "for_training"): + if hasattr(self, 'model') and hasattr(self.model, "for_training"): self.model.for_training() output = f(self, *args, **kwargs) # Return inference mode - if hasattr(self, model) and hasattr(self.model, "for_inference"): + if hasattr(self, 'model') and hasattr(self.model, "for_inference"): self.model.for_inference() return output return wrapper From 
f0ec1aeaa550daa755b8eb4b89c1a0ba6c496c91 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 07:36:15 -0700 Subject: [PATCH 181/272] Update vision.py --- unsloth/models/vision.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 10f757c21b..f62596a8c2 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -246,6 +246,13 @@ def unsloth_base_fast_generate( if cache_implementation is not None: kwargs["compile_config"] = _compile_config pass + + # Delete cached Flex Attention masks to reset inference + for name, module in self.named_modules(): + if hasattr(module, "_flex_attention_cache"): + del module._flex_attention_cache + pass + with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From a64a3b2d619af08ef5214e6281a7fa0f6c2f039a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 08:08:09 -0700 Subject: [PATCH 182/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4f9c308b32..c7f67acfdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.9", + "unsloth_zoo>=2025.9.10", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.9", + "unsloth_zoo>=2025.9.10", "packaging", "tyro", "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.55.4", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 8275283c6a..ef8fc0fba6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.9.7" +__version__ = "2025.9.8" __all__ = [ "SUPPORTS_BFLOAT16", From 1b7640ba35436dda38385d7a463bebdc34ac3969 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 08:41:29 -0700 Subject: [PATCH 183/272] Update vision.py --- unsloth/models/vision.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index f62596a8c2..e83bd3ec08 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -957,7 +957,12 @@ def _for_training(m): # Pad tokenizer to the left if hasattr(m, "_saved_temp_tokenizer"): m._saved_temp_tokenizer.padding_side = "right" # Set a flag for generation! - if hasattr(m, "_flag_for_generation"): del m._flag_for_generation + if hasattr(m, "_flag_for_generation"): + try: + # Weirdly sometimes cannot succeed so do a try except + del m._flag_for_generation + except: + pass pass m = model while hasattr(m, "model"): From f5c438540a2e69d4985171f9e04badebda67e7fa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:01:53 -0700 Subject: [PATCH 184/272] Update vision.py --- unsloth/models/vision.py | 58 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e83bd3ec08..fdfaee89f5 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -83,10 +83,13 @@ VLLM_SUPPORTED_VLM = [ "qwen2_5_vl", "gemma3", - "mistral3" + "mistral3", ] VLLM_NON_LORA_VLM = [ - "mllama" + "mllama", +] +PRE_COMPILE_INFERENCE = [ + "gpt_oss", ] from transformers import GenerationConfig, CompileConfig, HybridCache, AutoConfig, PretrainedConfig @@ -250,12 +253,29 @@ def unsloth_base_fast_generate( # Delete cached Flex Attention masks to reset inference for name, module in self.named_modules(): if hasattr(module, "_flex_attention_cache"): - del module._flex_attention_cache + try: del module._flex_attention_cache + except: pass + # Solves 
AttributeError: 'SlidingWindowLayer' object has no attribute 'max_batch_size' + if hasattr(module, "_cache") and "cache_utils" in str(module._cache.__class__): + try: del module._cache + except: pass pass + # DO INFERENCE with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) + # Delete cached Flex Attention masks to reset inference + for name, module in self.named_modules(): + if hasattr(module, "_flex_attention_cache"): + try: del module._flex_attention_cache + except: pass + # Solves AttributeError: 'SlidingWindowLayer' object has no attribute 'max_batch_size' + if hasattr(module, "_cache") and "cache_utils" in str(module._cache.__class__): + try: del module._cache + except: pass + pass + # FastBaseModel.for_training(self) return output pass @@ -674,6 +694,7 @@ def from_pretrained( model, use_gradient_checkpointing = use_gradient_checkpointing, trust_remote_code = trust_remote_code, + model_type = model_type_arch, ) # Clear deleted GPU items for _ in range(3): @@ -686,6 +707,31 @@ def from_pretrained( return model, tokenizer pass + @staticmethod + def pre_compile_for_inference(model_type, model, tokenizer): + """ + We need to invoke torch.compile to save VRAM usage and make it faster downstream. + Sometimes torch.compile can use 3GB weirdly on large batches, then it goes down to <1GB. + So we invoke torch.compile on short batches to reduce VRAM usage. 
+ """ + if model_type is None or model is None or tokenizer is None: return + if str(model_type).lower() not in PRE_COMPILE_INFERENCE: return + if getattr(tokenizer, "chat_template", None) is None: return + messages = [ + [ + {"role": "user", "content": f"1+1"}, + ], + ]*4 + inputs = tokenizer.apply_chat_template( + messages, + add_generation_prompt = True, + return_tensors = "pt", + return_dict = True, + ).to(model.device) + print(f"Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take a few minutes!") + _ = model.generate(**inputs, max_new_tokens = 3) + del inputs + pass @staticmethod def get_peft_model( @@ -823,6 +869,7 @@ def post_patch_model( model, use_gradient_checkpointing = True, trust_remote_code = False, + model_type = None, ): full_finetuning = os.environ.get("UNSLOTH_ENABLE_FULL_FINETUNING", "0") == "1" @@ -850,10 +897,12 @@ def post_patch_model( patch_saving_functions(model, vision = True) # Patch tokenizer to pad to the left + tokenizer = None m = model while hasattr(m, "model"): if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): + tokenizer = m._saved_temp_tokenizer m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP @@ -862,6 +911,7 @@ def post_patch_model( pass if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): + tokenizer = m._saved_temp_tokenizer m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP @@ -890,6 +940,8 @@ def post_patch_model( if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 + # Patch for torch.compiled inference + FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 8438a7620eafc9f012c92c6ac01c61c0da0b7bf3 Mon Sep 17 
00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:05:18 -0700 Subject: [PATCH 185/272] Update vision.py --- unsloth/models/vision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index fdfaee89f5..941e461889 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -941,6 +941,7 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for torch.compiled inference + print("Precompiling") FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 5867273c8fd0649bf8f304f4182bd9250c844ef6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:06:50 -0700 Subject: [PATCH 186/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 941e461889..07067afb3c 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -941,7 +941,7 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for torch.compiled inference - print("Precompiling") + print(model_type, model, tokenizer) FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 7b2bef1053c5b136431a727314ddf1c220afc7e9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:10:09 -0700 Subject: [PATCH 187/272] Update vision.py --- unsloth/models/vision.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 07067afb3c..5d0080f8b8 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -695,6 +695,7 @@ def from_pretrained( use_gradient_checkpointing = use_gradient_checkpointing, trust_remote_code = trust_remote_code, model_type = model_type_arch, + tokenizer = tokenizer, ) # Clear deleted GPU items for _ in 
range(3): @@ -717,6 +718,10 @@ def pre_compile_for_inference(model_type, model, tokenizer): if model_type is None or model is None or tokenizer is None: return if str(model_type).lower() not in PRE_COMPILE_INFERENCE: return if getattr(tokenizer, "chat_template", None) is None: return + # Check if already compiled and exit + for module in model.modules(): + if hasattr(module, "_pre_compiled_for_inference"): return + pass messages = [ [ {"role": "user", "content": f"1+1"}, @@ -731,6 +736,8 @@ def pre_compile_for_inference(model_type, model, tokenizer): print(f"Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take a few minutes!") _ = model.generate(**inputs, max_new_tokens = 3) del inputs + # Set we already pre compiled + model._pre_compiled_for_inference = True pass @staticmethod @@ -870,6 +877,7 @@ def post_patch_model( use_gradient_checkpointing = True, trust_remote_code = False, model_type = None, + tokenizer = None, ): full_finetuning = os.environ.get("UNSLOTH_ENABLE_FULL_FINETUNING", "0") == "1" @@ -897,12 +905,10 @@ def post_patch_model( patch_saving_functions(model, vision = True) # Patch tokenizer to pad to the left - tokenizer = None m = model while hasattr(m, "model"): if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): - tokenizer = m._saved_temp_tokenizer m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP @@ -911,7 +917,6 @@ def post_patch_model( pass if hasattr(m, "_saved_temp_tokenizer"): if hasattr(m._saved_temp_tokenizer, "tokenizer"): - tokenizer = m._saved_temp_tokenizer m._saved_temp_tokenizer.tokenizer.padding_side = "left" pass # Also set is_loaded_in_8bit to disable incorrect DDP @@ -941,7 +946,6 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for torch.compiled inference - print(model_type, model, tokenizer) 
FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 82a7697da359a8df6c54c72a02ce9795e90c60c4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:13:36 -0700 Subject: [PATCH 188/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 5d0080f8b8..15fe64858f 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -733,7 +733,7 @@ def pre_compile_for_inference(model_type, model, tokenizer): return_tensors = "pt", return_dict = True, ).to(model.device) - print(f"Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take a few minutes!") + print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take a ~ 3 minutes!") _ = model.generate(**inputs, max_new_tokens = 3) del inputs # Set we already pre compiled From aa9b200437e18a4f4856c3d8c9dbe2946b50663f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:19:24 -0700 Subject: [PATCH 189/272] Update vision.py --- unsloth/models/vision.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 15fe64858f..1a1de849cf 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -733,7 +733,8 @@ def pre_compile_for_inference(model_type, model, tokenizer): return_tensors = "pt", return_dict = True, ).to(model.device) - print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take a ~ 3 minutes!") + print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take 3 minutes or so!") + print("========= Pre compiling model for faster inference. Please be patient thank you! 
=========") _ = model.generate(**inputs, max_new_tokens = 3) del inputs # Set we already pre compiled From eb1df232a50d0ae6889e3cf3357faf9b3d31109f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:22:55 -0700 Subject: [PATCH 190/272] Update vision.py --- unsloth/models/vision.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 1a1de849cf..15fb7f8232 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -722,6 +722,22 @@ def pre_compile_for_inference(model_type, model, tokenizer): for module in model.modules(): if hasattr(module, "_pre_compiled_for_inference"): return pass + print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take 3 minutes or so!") + print("========= Pre compiling model for faster inference. Please be patient thank you! =========") + # Do single inference + messages = [ + [ + {"role": "user", "content": f"1+1"}, + ], + ]*1 + inputs = tokenizer.apply_chat_template( + messages, + add_generation_prompt = True, + return_tensors = "pt", + return_dict = True, + ).to(model.device) + _ = model.generate(**inputs, max_new_tokens = 3) + # Do batched inference messages = [ [ {"role": "user", "content": f"1+1"}, @@ -733,10 +749,7 @@ def pre_compile_for_inference(model_type, model, tokenizer): return_tensors = "pt", return_dict = True, ).to(model.device) - print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take 3 minutes or so!") - print("========= Pre compiling model for faster inference. Please be patient thank you! 
=========") _ = model.generate(**inputs, max_new_tokens = 3) - del inputs # Set we already pre compiled model._pre_compiled_for_inference = True pass From 563aa35081e581bcbac63cf8943c0ad4489195ac Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 18:33:51 -0700 Subject: [PATCH 191/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 15fb7f8232..4a559dc8e3 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -727,7 +727,7 @@ def pre_compile_for_inference(model_type, model, tokenizer): # Do single inference messages = [ [ - {"role": "user", "content": f"1+1"}, + {"role": "user", "content": f"What is 1+1 equal to?"}, ], ]*1 inputs = tokenizer.apply_chat_template( From 4bfde2ea2333be379456b5b296cc20b960891869 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 23 Sep 2025 23:14:47 -0700 Subject: [PATCH 192/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 4a559dc8e3..94dcf3673d 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -960,7 +960,7 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for torch.compiled inference - FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) + # FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From d6beafe16a7c9f388777e04032397ff6e61cc9e6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 00:01:42 -0700 Subject: [PATCH 193/272] MXFP4 dequant --- unsloth/models/loader.py | 1 + unsloth/models/vision.py | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 875c6f73fc..ade7e1292f 100644 --- 
a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -910,6 +910,7 @@ def from_pretrained( supports_sdpa = supports_sdpa, whisper_language = whisper_language, whisper_task = whisper_task, + auto_config = auto_config, # Pass vLLM/inference parameters fast_inference = fast_inference, diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 94dcf3673d..1894bd185f 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -300,7 +300,9 @@ def from_pretrained( supports_sdpa = True, whisper_language = None, whisper_task = None, - fast_inference = False, + auto_config = None, + # vLLM parameters + fast_inference = False, gpu_memory_utilization = 0.5, float8_kv_cache = False, random_state = 3407, @@ -500,10 +502,27 @@ def from_pretrained( # Cannot be None, since HF now checks for the config if load_in_4bit: # Ignore load_in_4bit / load_in_8bit for MXFP4 - best to get config file - if "gpt-oss" in model_name.lower(): + if "gpt-oss-20b" in model_name.lower() or "gpt-oss-120b" in model_name.lower(): pass else: kwargs["quantization_config"] = bnb_config + else: + # Try dequantizing the quantized model if it's a quantized model + if auto_config is None: + auto_config = AutoConfig.from_pretrained( + model_name, + token = token, + trust_remote_code = trust_remote_code, + ) + if hasattr(auto_config, "quantization_config"): + from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING + quantizer = AUTO_QUANTIZATION_CONFIG_MAPPING[auto_config["quant_method"]] + quantizer_kwargs = {} + if "dequantize" in inspect.signature(quantizer).parameters: + quantizer_kwargs["dequantize"] = True + quantization_config = quantizer.from_dict(config, **quantizer_kwargs) + kwargs["quantization_config"] = quantization_config + pass pass # Check if using forced float32 - we load it in bfloat16, then cast to float16! 
From 19cfe1be33961e083ec345357d6e4eb083bb2ab8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 00:03:24 -0700 Subject: [PATCH 194/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ade7e1292f..c859d62858 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -910,7 +910,7 @@ def from_pretrained( supports_sdpa = supports_sdpa, whisper_language = whisper_language, whisper_task = whisper_task, - auto_config = auto_config, + auto_config = model_config, # Pass vLLM/inference parameters fast_inference = fast_inference, From 63a7f65a89893e484c620365f5d15f9c1f782a33 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 00:06:52 -0700 Subject: [PATCH 195/272] Update vision.py --- unsloth/models/vision.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 1894bd185f..7722ddd482 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -516,7 +516,8 @@ def from_pretrained( ) if hasattr(auto_config, "quantization_config"): from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING - quantizer = AUTO_QUANTIZATION_CONFIG_MAPPING[auto_config["quant_method"]] + quantization_config = auto_config.quantization_config + quantizer = AUTO_QUANTIZATION_CONFIG_MAPPING[quantization_config["quant_method"]] quantizer_kwargs = {} if "dequantize" in inspect.signature(quantizer).parameters: quantizer_kwargs["dequantize"] = True From df5282b7339b7712f916dbdf2ab958f607184271 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 00:13:28 -0700 Subject: [PATCH 196/272] load_in_16bit --- unsloth/models/loader.py | 25 ++++++++++++++++--------- unsloth/models/vision.py | 12 ++++++++---- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 
c859d62858..0e130fb973 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -110,8 +110,9 @@ def from_pretrained( model_name = "unsloth/Llama-3.2-1B-Instruct", max_seq_length = 2048, dtype = None, - load_in_4bit = True, - load_in_8bit = False, + load_in_4bit = True, # 4bit QLoRA + load_in_8bit = False, # 8bit LoRA + load_in_16bit = False, # 16bit LoRA full_finetuning = False, token = None, device_map = "sequential", @@ -147,6 +148,7 @@ def from_pretrained( dtype = dtype, load_in_4bit = load_in_4bit, load_in_8bit = load_in_8bit, + load_in_16bit = load_in_16bit, full_finetuning = full_finetuning, token = token, device_map = device_map, @@ -386,6 +388,7 @@ def from_pretrained( dtype = dtype, load_in_4bit = load_in_4bit, load_in_8bit = load_in_8bit, + load_in_16bit = load_in_16bit, full_finetuning = full_finetuning, token = token, device_map = device_map, @@ -523,8 +526,9 @@ def from_pretrained( model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", max_seq_length = 2048, dtype = None, - load_in_4bit = True, - load_in_8bit = False, + load_in_4bit = True, # 4bit QLoRA + load_in_8bit = False, # 8bit LoRA + load_in_16bit = False, # 16bit LoRA full_finetuning = False, token = None, device_map = "sequential", @@ -576,15 +580,17 @@ def from_pretrained( if full_finetuning and (load_in_4bit or load_in_8bit): print("Unsloth: You selected full finetuning support, but 4bit / 8bit is enabled - disabling LoRA / QLoRA.") - load_in_4bit = False - load_in_8bit = False + load_in_4bit = False + load_in_8bit = False + load_in_16bit = False pass - if load_in_4bit and load_in_8bit: + if int(load_in_4bit) + int(load_in_8bit) + int(load_in_16bit) >= 2: raise RuntimeError( - "Unsloth: Can only load in 4bit or 8bit, not both!\n"\ + "Unsloth: Can only load in 4bit or 8bit or 16bit, not a combination!\n"\ "Also, we by default set `load_in_4bit = True`.\n"\ - "If you want 8bit finetuning, set both `load_in_4bit = False` and `load_in_8bit = True`" + "If you want 8bit 
finetuning, set both `load_in_4bit = False` and `load_in_8bit = True`\n"\ + "If you want 16bit LoRA finetuning, set `load_in_16bit = True`" ) pass @@ -898,6 +904,7 @@ def from_pretrained( dtype = _get_dtype(dtype), load_in_4bit = load_in_4bit, load_in_8bit = load_in_8bit, + load_in_16bit = load_in_16bit, full_finetuning = full_finetuning, token = token, device_map = device_map, diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 7722ddd482..506d16bc57 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -289,6 +289,7 @@ def from_pretrained( dtype = None, load_in_4bit = True, load_in_8bit = False, + load_in_16bit = False, full_finetuning = False, token = None, device_map = "sequential", @@ -462,12 +463,13 @@ def from_pretrained( bnb_config = None if full_finetuning and (load_in_4bit or load_in_8bit): print("Unsloth: You selected full finetuning support, but 4bit / 8bit is enabled - disabling LoRA / QLoRA.") - load_in_4bit = False - load_in_8bit = False + load_in_4bit = False + load_in_8bit = False + load_in_16bit = False pass - if load_in_4bit and load_in_8bit: - raise RuntimeError("Unsloth: Can only load in 4bit or 8bit, not both!") + if int(load_in_4bit) + int(load_in_8bit) + int(load_in_16bit) >= 2: + raise RuntimeError("Unsloth: Can only load in 4bit or 8bit or 16bit, not a combination!") if load_in_4bit: bnb_config = BitsAndBytesConfig( load_in_4bit = True, @@ -481,6 +483,8 @@ def from_pretrained( load_in_8bit = True, llm_int8_skip_modules = SKIP_QUANTIZATION_MODULES.copy(), ) + elif load_in_16bit: + bnb_config = None elif not load_in_4bit and not load_in_8bit and not full_finetuning: print("Unsloth: QLoRA and full finetuning all not selected. 
Switching to 16bit LoRA.") pass From e7174b161f3dfb083076ed5844a7358e90019cea Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 00:14:02 -0700 Subject: [PATCH 197/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 506d16bc57..f1ff53a526 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -525,7 +525,7 @@ def from_pretrained( quantizer_kwargs = {} if "dequantize" in inspect.signature(quantizer).parameters: quantizer_kwargs["dequantize"] = True - quantization_config = quantizer.from_dict(config, **quantizer_kwargs) + quantization_config = quantizer.from_dict(quantization_config, **quantizer_kwargs) kwargs["quantization_config"] = quantization_config pass pass From ffe5aca5d7b622d4b7de5297570519085931e9d4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 01:57:28 -0700 Subject: [PATCH 198/272] Update vision.py --- unsloth/models/vision.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index f1ff53a526..78d2cd6573 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -760,7 +760,7 @@ def pre_compile_for_inference(model_type, model, tokenizer): return_tensors = "pt", return_dict = True, ).to(model.device) - _ = model.generate(**inputs, max_new_tokens = 3) + _ = model.generate(**inputs, max_new_tokens = 1) # Do batched inference messages = [ [ @@ -773,7 +773,7 @@ def pre_compile_for_inference(model_type, model, tokenizer): return_tensors = "pt", return_dict = True, ).to(model.device) - _ = model.generate(**inputs, max_new_tokens = 3) + _ = model.generate(**inputs, max_new_tokens = 2) # Set we already pre compiled model._pre_compiled_for_inference = True pass @@ -984,7 +984,7 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for 
torch.compiled inference - # FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) + FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 81356cc76031d5684aabee020394398acec8f785 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 24 Sep 2025 07:22:30 -0700 Subject: [PATCH 199/272] Update vision.py --- unsloth/models/vision.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 78d2cd6573..ff3e216c3a 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -511,7 +511,6 @@ def from_pretrained( else: kwargs["quantization_config"] = bnb_config else: - # Try dequantizing the quantized model if it's a quantized model if auto_config is None: auto_config = AutoConfig.from_pretrained( model_name, @@ -523,8 +522,9 @@ def from_pretrained( quantization_config = auto_config.quantization_config quantizer = AUTO_QUANTIZATION_CONFIG_MAPPING[quantization_config["quant_method"]] quantizer_kwargs = {} - if "dequantize" in inspect.signature(quantizer).parameters: - quantizer_kwargs["dequantize"] = True + # We cannot dequantize since gpt-oss-20b MXFP4 will now be gpt-oss-20b-BF16 + # if "dequantize" in inspect.signature(quantizer).parameters: + # quantizer_kwargs["dequantize"] = True quantization_config = quantizer.from_dict(quantization_config, **quantizer_kwargs) kwargs["quantization_config"] = quantization_config pass From 2313ea949292209f602bad9b75a9000f8d3f217e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Sep 2025 03:32:05 -0700 Subject: [PATCH 200/272] Update rl.py --- unsloth/models/rl.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 65434bb095..2b6293993d 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -963,6 +963,19 @@ def patch_functions(RLTrainer, trainer_file, RLTrainer_name, all_imports, import source = edit_function(function, 
source) pass + """ + import torch + X = torch.ones((2, 2048, 201088), dtype = torch.bfloat16, device = "cuda") + X[torch.randperm(2, dtype = torch.int64, device = X.device)] + + will error out in torch 2.8 AcceleratorError: CUDA error: invalid configuration argument + """ + source = re.sub( + r"(\n[\s]{4,})generation_batch = shuffle_sequence_dict\(generation_batch\)\n", + r"\n\1try: generation_batch = shuffle_sequence_dict(generation_batch)\n\1except: pass\n", + source, + ) + # llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model source = re.sub( r"(\n[\s]{4,}).+?model_executor\.driver_worker.+?\n", From 0c18d86fbf64269a54f924102eb65bd26b424f7b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 25 Sep 2025 22:00:45 -0700 Subject: [PATCH 201/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index ff3e216c3a..5b99cef8a7 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -984,7 +984,7 @@ def post_patch_model( if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 # Patch for torch.compiled inference - FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) + # FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 19017fd5ba998296851d2eb6a7ad8a80b553da2e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 26 Sep 2025 02:33:44 -0700 Subject: [PATCH 202/272] offload_embedding --- unsloth/models/loader.py | 5 +++++ unsloth/models/vision.py | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 0e130fb973..98396bb754 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -123,6 +123,7 @@ def from_pretrained( resize_model_vocab = None, revision = None, use_exact_model_name = False, + offload_embedding = False, fast_inference = False, # uses 
vLLM gpu_memory_utilization = 0.5, @@ -161,6 +162,7 @@ def from_pretrained( return_logits = False, # Return logits fullgraph = True, # No graph breaks use_exact_model_name = use_exact_model_name, + offload_embedding = offload_embedding, # Pass vLLM/inference parameters fast_inference = fast_inference, @@ -401,6 +403,7 @@ def from_pretrained( return_logits = False, # Return logits fullgraph = True, # No graph breaks use_exact_model_name = use_exact_model_name, + offload_embedding = offload_embedding, # Pass vLLM/inference parameters fast_inference = fast_inference, @@ -545,6 +548,7 @@ def from_pretrained( whisper_language = None, whisper_task = None, unsloth_force_compile = False, + offload_embedding = False, # Add the missing vLLM/inference parameters fast_inference = False, # uses vLLM @@ -918,6 +922,7 @@ def from_pretrained( whisper_language = whisper_language, whisper_task = whisper_task, auto_config = model_config, + offload_embedding = offload_embedding, # Pass vLLM/inference parameters fast_inference = fast_inference, diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 5b99cef8a7..e658f18347 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -302,6 +302,7 @@ def from_pretrained( whisper_language = None, whisper_task = None, auto_config = None, + offload_embedding = False, # vLLM parameters fast_inference = False, gpu_memory_utilization = 0.5, @@ -551,6 +552,15 @@ def from_pretrained( if hasattr(model, 'generate'): model.fast_generate = model.generate model.fast_generate_batches = error_out_no_vllm + if offload_embedding: + embed_tokens = model.get_input_embeddings() + nbytes = embed_tokens.weight.numel() * embed_tokens.weight.itemsize + ngb = round(nbytes / 1024 / 1024 / 1024, 2) + print(f"Unsloth: Offloading embeddings to RAM to save {ngb} GB.") + embed_tokens.to("cpu") + # Must free GPU memory otherwise will not free! 
+ torch.cuda.empty_cache() + gc.collect() else: from unsloth_zoo.vllm_utils import ( load_vllm, From 77fca7998e09db683d80ebfdfc0b5b2715c601cb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 26 Sep 2025 03:04:53 -0700 Subject: [PATCH 203/272] Update vision.py --- unsloth/models/vision.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e658f18347..dc1def7a7b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -558,6 +558,17 @@ def from_pretrained( ngb = round(nbytes / 1024 / 1024 / 1024, 2) print(f"Unsloth: Offloading embeddings to RAM to save {ngb} GB.") embed_tokens.to("cpu") + embed_tokens.weight.pin_memory() + + # Add hooks to move inputs to CPU and back to CUDA + def pre_hook(module, args): + args[0]._old_device = args[0].device + return (args[0].to("cpu", non_blocking = True)) + def post_hook(module, args, output): + old_device = getattr(args[0], "_old_device", "cuda") + return output.to(old_device, non_blocking = True) + embed_tokens.register_forward_pre_hook(pre_hook, prepend = True) + embed_tokens.register_forward_hook (post_hook, prepend = True) # Must free GPU memory otherwise will not free! 
torch.cuda.empty_cache() gc.collect() From 92084ba38bac1bb770dae817ce7acc94412d5cad Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 26 Sep 2025 03:08:47 -0700 Subject: [PATCH 204/272] Update vision.py --- unsloth/models/vision.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index dc1def7a7b..cedaa14125 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -558,7 +558,6 @@ def from_pretrained( ngb = round(nbytes / 1024 / 1024 / 1024, 2) print(f"Unsloth: Offloading embeddings to RAM to save {ngb} GB.") embed_tokens.to("cpu") - embed_tokens.weight.pin_memory() # Add hooks to move inputs to CPU and back to CUDA def pre_hook(module, args): From 499f939c9b118d59a85952da626c913f6a915cfd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 26 Sep 2025 03:17:35 -0700 Subject: [PATCH 205/272] Update vision.py --- unsloth/models/vision.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index cedaa14125..2cd95b0377 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -560,14 +560,15 @@ def from_pretrained( embed_tokens.to("cpu") # Add hooks to move inputs to CPU and back to CUDA - def pre_hook(module, args): - args[0]._old_device = args[0].device - return (args[0].to("cpu", non_blocking = True)) - def post_hook(module, args, output): - old_device = getattr(args[0], "_old_device", "cuda") - return output.to(old_device, non_blocking = True) - embed_tokens.register_forward_pre_hook(pre_hook, prepend = True) - embed_tokens.register_forward_hook (post_hook, prepend = True) + # [TODO] Doesn't seem to work! 
+ # def pre_hook(module, args): + # args[0]._old_device = args[0].device + # return (args[0].to("cpu", non_blocking = True)) + # def post_hook(module, args, output): + # old_device = getattr(args[0], "_old_device", "cuda") + # return output.to(old_device, non_blocking = True) + # embed_tokens.register_forward_pre_hook(pre_hook, prepend = True) + # embed_tokens.register_forward_hook (post_hook, prepend = True) # Must free GPU memory otherwise will not free! torch.cuda.empty_cache() gc.collect() From f72c0a9b26b89ad1387b020aa25b242d0e0a7833 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Sep 2025 17:44:58 -0700 Subject: [PATCH 206/272] Update vision.py --- unsloth/models/vision.py | 52 ++-------------------------------------- 1 file changed, 2 insertions(+), 50 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 2cd95b0377..071416dd1b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -524,8 +524,8 @@ def from_pretrained( quantizer = AUTO_QUANTIZATION_CONFIG_MAPPING[quantization_config["quant_method"]] quantizer_kwargs = {} # We cannot dequantize since gpt-oss-20b MXFP4 will now be gpt-oss-20b-BF16 - # if "dequantize" in inspect.signature(quantizer).parameters: - # quantizer_kwargs["dequantize"] = True + if load_in_16bit and "dequantize" in inspect.signature(quantizer).parameters: + quantizer_kwargs["dequantize"] = True quantization_config = quantizer.from_dict(quantization_config, **quantizer_kwargs) kwargs["quantization_config"] = quantization_config pass @@ -753,52 +753,6 @@ def from_pretrained( return model, tokenizer pass - @staticmethod - def pre_compile_for_inference(model_type, model, tokenizer): - """ - We need to invoke torch.compile to save VRAM usage and make it faster downstream. - Sometimes torch.compile can use 3GB weirdly on large batches, then it goes down to <1GB. - So we invoke torch.compile on short batches to reduce VRAM usage. 
- """ - if model_type is None or model is None or tokenizer is None: return - if str(model_type).lower() not in PRE_COMPILE_INFERENCE: return - if getattr(tokenizer, "chat_template", None) is None: return - # Check if already compiled and exit - for module in model.modules(): - if hasattr(module, "_pre_compiled_for_inference"): return - pass - print(f"🦥 Unsloth: Pre compiling {model_type.title()} model for faster inference - this might take 3 minutes or so!") - print("========= Pre compiling model for faster inference. Please be patient thank you! =========") - # Do single inference - messages = [ - [ - {"role": "user", "content": f"What is 1+1 equal to?"}, - ], - ]*1 - inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt = True, - return_tensors = "pt", - return_dict = True, - ).to(model.device) - _ = model.generate(**inputs, max_new_tokens = 1) - # Do batched inference - messages = [ - [ - {"role": "user", "content": f"1+1"}, - ], - ]*4 - inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt = True, - return_tensors = "pt", - return_dict = True, - ).to(model.device) - _ = model.generate(**inputs, max_new_tokens = 2) - # Set we already pre compiled - model._pre_compiled_for_inference = True - pass - @staticmethod def get_peft_model( model, @@ -1004,8 +958,6 @@ def post_patch_model( if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: if module.padding_idx < module.weight.shape[0]: module.weight[module.padding_idx] = 0 - # Patch for torch.compiled inference - # FastBaseModel.pre_compile_for_inference(model_type, model, tokenizer) return model pass From 2a7cfa0ecf137fedd970e67c330d893c7e8f60f0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Sep 2025 17:47:28 -0700 Subject: [PATCH 207/272] Update vision.py --- unsloth/models/vision.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 
071416dd1b..0510b26128 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -856,7 +856,11 @@ def get_peft_model( # Enable gradients on modules which are trainable requires_grad_for_gradient_checkpointing(model) trust_remote_code = getattr(model, "_unsloth_trust_remote_code", False) - model = FastBaseModel.post_patch_model(model, use_gradient_checkpointing, trust_remote_code = trust_remote_code) + model = FastBaseModel.post_patch_model( + model, + use_gradient_checkpointing = use_gradient_checkpointing, + trust_remote_code = trust_remote_code, + ) model.max_seq_length = max_seq_length # Save to modules as well for module in model.modules(): From 2577d8162f9bfe734a30404dff8980530a6701c3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 27 Sep 2025 17:50:53 -0700 Subject: [PATCH 208/272] Update vision.py --- unsloth/models/vision.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 0510b26128..3522d46f0c 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -549,7 +549,7 @@ def from_pretrained( # attn_implementation = attn_implementation, **kwargs, ) - if hasattr(model, 'generate'): + if hasattr(model, "generate"): model.fast_generate = model.generate model.fast_generate_batches = error_out_no_vllm if offload_embedding: @@ -612,8 +612,17 @@ def from_pretrained( llm = load_vllm(**load_vllm_kwargs) # Convert to HF format - _, quant_state_dict = get_vllm_state_dict(llm, config = model_config, is_vision_model = True) - model = convert_vllm_to_huggingface(quant_state_dict, model_config, dtype, bnb_config, is_vision_model = True) + _, quant_state_dict = get_vllm_state_dict( + llm, + config = model_config, + is_vision_model = True, + ) + model = convert_vllm_to_huggingface( + quant_state_dict, + model_config, + dtype, bnb_config, + is_vision_model = True, + ) model.vllm_engine = llm model.fast_generate = model.vllm_engine.generate 
model.fast_generate_batches = functools.partial(generate_batches, model.vllm_engine) From 1eee987e60d908ee415caa9108dc50398ace64ba Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 02:32:05 -0700 Subject: [PATCH 209/272] Update rl_replacements.py --- unsloth/models/rl_replacements.py | 50 ++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py index ec81106890..a207514e72 100644 --- a/unsloth/models/rl_replacements.py +++ b/unsloth/models/rl_replacements.py @@ -27,6 +27,7 @@ from collections import defaultdict from unsloth_zoo.rl_replacements import RL_REPLACEMENTS, left_pack_padding from unsloth import DEVICE_TYPE +import textwrap RL_EXTRA_ARGS = defaultdict(list) RL_FUNCTIONS = defaultdict(list) @@ -295,12 +296,59 @@ def grpo_trainer__generate_and_score_completions(function_name, function): if self.use_vllm:""" function = function.replace(replace_part, new_replacement) - return function pass RL_FUNCTIONS["grpo_trainer"].append(grpo_trainer__generate_and_score_completions) +# Fix {"reasoning_effort" : "high"} not applied +def grpo_trainer_fix_maybe_apply_chat_template(function_name, function): + spaces = function.find("def ") + if spaces % 4 != 0: return function + spaces += 4 + replacement = """ + _chat_template_ = getattr(self.processing_class, "chat_template", None) + if _chat_template_ is None: _chat_template_ = "" + _supported_keys_ = set(("prompt", "chosen", "rejected", "completion", "messages", "label")) + + prompts_text = [] + for _example_ in __INPUTS__REPLACEMENT__: + _tokenizer_kwargs_ = {} + if type(_example_) is not dict: + _example_ = {"prompt": _example_} + _left_keys_ = _example_.keys() - _supported_keys_ + for k in _left_keys_: + if k in _chat_template_: + v = _example_[k] + if type(v) is str: + _tokenizer_kwargs_[k] = v + _x_ = maybe_apply_chat_template(_example_, self.processing_class, **_tokenizer_kwargs_)["prompt"] + 
prompts_text.append(_x_) + """ + replacement = textwrap.dedent(replacement).strip() + replacement = textwrap.indent(replacement, spaces*" ") + replacement = f"\n{replacement}\n" + what = 'prompts_text = [maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs]' + function = function.replace(what, replacement.replace("__INPUTS__REPLACEMENT__", "inputs")) + + """prompts_text = [ + maybe_apply_chat_template({"prompt": prompt}, self.processing_class)["prompt"] for prompt in prompts + ]""" + function = re.sub( + r"prompts_text = \["\ + r"[\s]{0,}"\ + r"maybe_apply_chat_template\(\{[\"\']prompt[\"\'][\s]{0,}\:[\s]{0,}prompt[\s]{0,}\}[\s]{0,}\,[\s]{0,}self\.processing_class\)"\ + r"\[[\"\']prompt[\"\']\] for prompt in prompts"\ + r"[\s]{0,}"\ + r"\]", + replacement.replace("__INPUTS__REPLACEMENT__", "prompts"), + function, + ) + return function +pass +RL_FUNCTIONS["grpo_trainer"].append(grpo_trainer_fix_maybe_apply_chat_template) + + # Remove _move_model_to_vllm def grpo_trainer__move_model_to_vllm(function_name, function): if function_name != "_move_model_to_vllm": return function From 1edc796df17ec4c2da37b92168a79042d417eba8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 03:47:23 -0700 Subject: [PATCH 210/272] Update loader.py --- unsloth/models/loader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 98396bb754..3bcab1ce87 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -662,10 +662,15 @@ def from_pretrained( ) model_types_all = ",".join(model_types) + "," - # Check versions + # Save model types and loading method lowered_model_name = model_name.lower() - if os.environ.get("UNSLOTH_MODEL_NAME", "") == "": - os.environ["UNSLOTH_MODEL_NAME"] = lowered_model_name + string = os.environ.get("UNSLOTH_MODEL_NAME", "") + model_types_all + if load_in_4bit: string += "_load_in_4bit_" + if load_in_8bit: string += 
"_load_in_8bit_" + if load_in_16bit: string += "_load_in_16bit_" + os.environ["UNSLOTH_MODEL_NAME"] = string + + # Check versions LATEST = '\nPlease use transformers via `pip install --no-deps git+https://github.com/huggingface/transformers.git`' NIGHTLY = '\nPlease use nightly transformers via pip install --upgrade "transformers>=4.49.0"`' # Pixtral From 205d09cb1af754bcd2b948028ee729d3706e1a3e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 04:35:03 -0700 Subject: [PATCH 211/272] Fix padding issue --- unsloth/models/_utils.py | 2 +- unsloth/models/vision.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5af8c756e3..2df9b878d0 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.9" +__version__ = "2025.9.10" __all__ = [ "SUPPORTS_BFLOAT16", diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 524bf64cdb..ae76c573d0 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -965,12 +965,15 @@ def post_patch_model( m.for_inference = functools.partial(FastBaseModel.for_inference, m) m = m.model # Set weight[padding_idx] = 0 - with torch.no_grad(): - for name, module in model.named_modules(): - if type(module) is torch.nn.Embedding: - if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: - if module.padding_idx < module.weight.shape[0]: - module.weight[module.padding_idx] = 0 + # Only do this if tokenizer is defined since eos_token == pad_token sometimes! 
+ pad_token_id = getattr(tokenizer, "pad_token_id", None) + if tokenizer is not None and getattr(tokenizer, "eos_token_id", None) != pad_token_id: + with torch.no_grad(): + for name, module in model.named_modules(): + if type(module) is torch.nn.Embedding: + if getattr(module, "weight", None) is not None and getattr(module, "padding_idx", None) is not None: + if module.padding_idx == pad_token_id and module.padding_idx < module.weight.shape[0]: + module.weight[module.padding_idx] = 0 return model pass From 07cc6ed405743f81e711ac4e2c53059a723111ff Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 04:59:34 -0700 Subject: [PATCH 212/272] Update pyproject.toml --- pyproject.toml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0a3cfa1f79..a3aa62d37b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,18 +37,18 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.11", + "unsloth_zoo>=2025.9.12", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", - "datasets>=3.4.1,<4.0.0", + "datasets>=3.4.1,!=4.0.*,!=4.1.0", "sentencepiece>=0.2.0", "tqdm", "psutil", "wheel>=0.42.0", "numpy", "accelerate>=0.34.1", - "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,!=0.19.0", + "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,!=0.19.0,<=0.23.0", "peft>=0.7.1,!=0.11.0", "protobuf", "huggingface_hub>=0.34.0", @@ -453,11 +453,11 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.11", + "unsloth_zoo>=2025.9.12", "packaging", "tyro", - "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", - "datasets>=3.4.1,<4.0.0", + "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", + "datasets>=3.4.1,!=4.0.*,!=4.1.0", "sentencepiece>=0.2.0", "tqdm", "psutil", @@ -471,7 +471,7 @@ 
colab-new = [ ] colab-no-deps = [ "accelerate>=0.34.1", - "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,!=0.19.0", + "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,!=0.19.0,<=0.23.0", "peft>=0.7.1", "xformers", "bitsandbytes>=0.45.5", From d225f7f1eb4f143812ec5637b4c8cc9a3fe846c7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 04:59:50 -0700 Subject: [PATCH 213/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 2df9b878d0..3844df7e97 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.10" +__version__ = "2025.9.11" __all__ = [ "SUPPORTS_BFLOAT16", From 5d6c3d9c9eaf080454ad83607e9c83f29bb4ecaa Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 05:01:49 -0700 Subject: [PATCH 214/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a3aa62d37b..510e186cde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.12", + "unsloth_zoo>=2025.9.13", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.12", + "unsloth_zoo>=2025.9.13", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", From af56af339e62d569028802f54ffd26b62923c57c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 30 Sep 2025 05:02:47 -0700 Subject: [PATCH 215/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3844df7e97..2df9b878d0 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.11" +__version__ = "2025.9.10" __all__ = [ "SUPPORTS_BFLOAT16", From eb2d403be4434e60a88cade57df5785bfbe01e31 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Oct 2025 04:40:24 -0700 Subject: [PATCH 216/272] Update vision.py --- unsloth/models/vision.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 9d086a04d5..71e4a98a90 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -561,7 +561,9 @@ def from_pretrained( # model.device also will change to CPU so change back m = model while hasattr(m, "model"): - if hasattr(m, "device"): m._old_device_ = m.device + if hasattr(m, "device"): + m._old_device_ = m.device + print(m._old_device_) m = m.model if hasattr(m, "device"): m._old_device_ = m.device @@ -575,11 +577,13 @@ def from_pretrained( try: m.device = m._old_device_ except: pass del m._old_device_ + print(m._old_device_) m = m.model if hasattr(m, "device"): try: m.device = m._old_device_ except: pass del m._old_device_ + print(m._old_device_) # Add hooks to move inputs to CPU and back to CUDA # [TODO] Doesn't seem to work! 
From 9bc76e8db536334dc6a69aeeffa9d578bddc0554 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Oct 2025 04:43:01 -0700 Subject: [PATCH 217/272] Update vision.py --- unsloth/models/vision.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 71e4a98a90..8bd2eea182 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -576,14 +576,15 @@ def from_pretrained( if hasattr(m, "device") and hasattr(m, "_old_device_"): try: m.device = m._old_device_ except: pass + print(m._old_device_, m.device) del m._old_device_ - print(m._old_device_) m = m.model if hasattr(m, "device"): try: m.device = m._old_device_ except: pass del m._old_device_ - print(m._old_device_) + print(m._old_device_, m.device) + print(model.device) # Add hooks to move inputs to CPU and back to CUDA # [TODO] Doesn't seem to work! From a0425bb45737499ce2dc87e0cd8ef982b2b0a24c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Oct 2025 04:44:21 -0700 Subject: [PATCH 218/272] Update vision.py --- unsloth/models/vision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 8bd2eea182..fd8cf04a64 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -563,7 +563,7 @@ def from_pretrained( while hasattr(m, "model"): if hasattr(m, "device"): m._old_device_ = m.device - print(m._old_device_) + print(m._old_device_, m.device) m = m.model if hasattr(m, "device"): m._old_device_ = m.device @@ -582,8 +582,8 @@ def from_pretrained( if hasattr(m, "device"): try: m.device = m._old_device_ except: pass - del m._old_device_ print(m._old_device_, m.device) + del m._old_device_ print(model.device) # Add hooks to move inputs to CPU and back to CUDA From b0ba73cd7e6f5005e271dade905f4fd6fe327ec9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Oct 2025 04:46:20 -0700 Subject: [PATCH 219/272] Update vision.py --- 
unsloth/models/vision.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index fd8cf04a64..c07c3221f9 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -574,14 +574,12 @@ def from_pretrained( m = model while hasattr(m, "model"): if hasattr(m, "device") and hasattr(m, "_old_device_"): - try: m.device = m._old_device_ - except: pass + m.device = m._old_device_ print(m._old_device_, m.device) del m._old_device_ m = m.model if hasattr(m, "device"): - try: m.device = m._old_device_ - except: pass + m.device = m._old_device_ print(m._old_device_, m.device) del m._old_device_ print(model.device) From f85a91a90b04281880afa88e28996ae9e43c4aa5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Oct 2025 04:57:03 -0700 Subject: [PATCH 220/272] Update vision.py --- unsloth/models/vision.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index c07c3221f9..fbe3c832c2 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -557,33 +557,8 @@ def from_pretrained( nbytes = embed_tokens.weight.numel() * embed_tokens.weight.itemsize ngb = round(nbytes / 1024 / 1024 / 1024, 2) print(f"Unsloth: Offloading embeddings to RAM to save {ngb} GB.") - - # model.device also will change to CPU so change back - m = model - while hasattr(m, "model"): - if hasattr(m, "device"): - m._old_device_ = m.device - print(m._old_device_, m.device) - m = m.model - if hasattr(m, "device"): m._old_device_ = m.device - - # Move embeddings to CPU embed_tokens.to("cpu") - # model.device also will change to CPU so change back - m = model - while hasattr(m, "model"): - if hasattr(m, "device") and hasattr(m, "_old_device_"): - m.device = m._old_device_ - print(m._old_device_, m.device) - del m._old_device_ - m = m.model - if hasattr(m, "device"): - m.device = m._old_device_ - print(m._old_device_, m.device) - del 
m._old_device_ - print(model.device) - # Add hooks to move inputs to CPU and back to CUDA # [TODO] Doesn't seem to work! # def pre_hook(module, args): From 47f2ef72631b6515bac8c1d353ec371fd129c6d1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 4 Oct 2025 23:39:09 -0700 Subject: [PATCH 221/272] Update vision.py --- unsloth/models/vision.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index fbe3c832c2..a7c12e9644 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -636,7 +636,9 @@ def from_pretrained( # Check float32 norm weights if os.environ.get("UNSLOTH_HIGH_PRECISION_LAYERNORM", "0") == "1": for jj, (name, module) in enumerate(model.named_modules()): - if name.endswith("norm") and hasattr(module, "weight"): + if (name.endswith(("norm", "norm1", "norm2", "norm3", "norm4")) \ + or "layernorm" in name or "layer_norm" in name) \ + and hasattr(module, "weight"): module._pre_set_compute_dtype = torch.float32 pass # Edit data-types From 06fc86f084a898c356ed137f11a2b671a50bbe64 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 5 Oct 2025 00:40:16 -0700 Subject: [PATCH 222/272] New models --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- unsloth/models/mapper.py | 46 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6c0b7f8ca1..1bbb9c657a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ triton = [ ] huggingface = [ - "unsloth_zoo>=2025.9.14", + "unsloth_zoo>=2025.10.1", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", @@ -453,7 +453,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3", ] colab-new = [ - "unsloth_zoo>=2025.9.14", + "unsloth_zoo>=2025.10.1", "packaging", "tyro", 
"transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 3079196e68..8650e21438 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.9.11" +__version__ = "2025.10.1" __all__ = [ "SUPPORTS_BFLOAT16", diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index eb9119b681..600396ed46 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -966,6 +966,52 @@ "mistralai/Magistral-Small-2509", "unsloth/Magistral-Small-2509-bnb-4bit", ), + "unsloth/Apertus-70B-Instruct-2509-unsloth-bnb-4bit" : ( + "unsloth/Apertus-70B-Instruct-2509", + "swiss-ai/Apertus-70B-2509", + "unsloth/Apertus-70B-Instruct-2509-unsloth-bnb-4bit", + ), + "unsloth/Apertus-8B-Instruct-2509-unsloth-bnb-4bit" : ( + "unsloth/Apertus-8B-Instruct-2509", + "swiss-ai/Apertus-8B-2509", + "unsloth/Apertus-8B-Instruct-2509-unsloth-bnb-4bit", + ), + "unsloth/granite-4.0-micro-unsloth-bnb-4bit" : ( + "unsloth/granite-4.0-micro", + "ibm-granite/granite-4.0-micro", + "unsloth/granite-4.0-micro-bnb-4bit", + ), + "unsloth/granite-4.0-h-micro-unsloth-bnb-4bit" : ( + "unsloth/granite-4.0-h-micro", + "ibm-granite/granite-4.0-h-micro", + "unsloth/granite-4.0-h-micro-bnb-4bit", + ), + "unsloth/granite-4.0-micro-base-unsloth-bnb-4bit" : ( + "unsloth/granite-4.0-micro-base", + "ibm-granite/granite-4.0-micro-base", + "unsloth/granite-4.0-micro-base-bnb-4bit", + ), + "unsloth/granite-4.0-h-micro-base-unsloth-bnb-4bit" : ( + "unsloth/granite-4.0-h-micro-base", + "ibm-granite/granite-4.0-h-micro-base", + "unsloth/granite-4.0-h-micro-base-bnb-4bit", + ), + "unsloth/granite-4.0-h-tiny" : ( + "unsloth/granite-4.0-h-tiny", + "ibm-granite/granite-4.0-h-tiny", + ), + "unsloth/granite-4.0-h-small" : ( + "unsloth/granite-4.0-h-small", + 
"ibm-granite/granite-4.0-h-small", + ), + "unsloth/granite-4.0-h-tiny-base" : ( + "unsloth/granite-4.0-h-tiny-base", + "ibm-granite/granite-4.0-h-tiny-base", + ), + "unsloth/granite-4.0-h-small-base" : ( + "unsloth/granite-4.0-h-small-base", + "ibm-granite/granite-4.0-h-small-base", + ), } INT_TO_FLOAT_MAPPER = {} From 778da7dcf8e375c71339d77405ad6b491c149515 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:32:29 -0700 Subject: [PATCH 223/272] Update llama.py --- unsloth/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 8ff74872a3..21b10f15cf 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1195,6 +1195,7 @@ def _CausalLM_fast_forward( logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(dtype)) else: RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1" + print("RETURN_LOGITS", RETURN_LOGITS) # < 1024 Normal Unsloth uses less VRAM! if bsz*q_len <= 1024: RETURN_LOGITS = True From ed443ee0c8057831d029bdd5fa6920994d9e80a8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:36:22 -0700 Subject: [PATCH 224/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0b10d0ca13..2f812c769f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ triton = [ "triton-windows ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", ] huggingface = [ - "unsloth_zoo>=2025.10.3", + "unsloth_zoo>=2025.10.4", "wheel>=0.42.0", "packaging", "torchvision", @@ -458,7 +458,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.3", + "unsloth_zoo>=2025.10.4", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", diff --git 
a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 93575a043d..5746f91694 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2025.10.3" +__version__ = "2025.10.4" __all__ = [ "SUPPORTS_BFLOAT16", From da00e2f0d1f9859ad9941adfce7dc8d60bb30622 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:36:59 -0700 Subject: [PATCH 225/272] Update _utils.py --- unsloth/models/_utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5746f91694..f5948cef1a 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1652,14 +1652,17 @@ def error_out_no_vllm(*args, **kwargs): raise NotImplementedError("Unsloth: vLLM is not yet supported for fast inference for this model! Please use `.generate` instead") -from torchao.core.config import AOBaseConfig try: - from torchao.quantization import Int4WeightOnlyConfig + from torchao.core.config import AOBaseConfig + try: + from torchao.quantization import Int4WeightOnlyConfig + except: + print("Unsloth: TorchAO changed `torchao.quantization.Int4WeightOnlyConfig`") + Int4WeightOnlyConfig = None + pass except: - print("Unsloth: TorchAO changed `torchao.quantization.Int4WeightOnlyConfig`") - Int4WeightOnlyConfig = None -pass - + AOBaseConfig = None + pass @dataclass class TorchAOConfig: qat_scheme : str = "int4" From 250ea60650ebc245cb2278bc63756e7df5c13db4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:37:50 -0700 Subject: [PATCH 226/272] Update llama.py --- unsloth/models/llama.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 21b10f15cf..8ff74872a3 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1195,7 +1195,6 @@ def _CausalLM_fast_forward( logits = 
self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(dtype)) else: RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1" - print("RETURN_LOGITS", RETURN_LOGITS) # < 1024 Normal Unsloth uses less VRAM! if bsz*q_len <= 1024: RETURN_LOGITS = True From a921ea6450a8c3f2dc62e78cb4821abd317066e4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:38:16 -0700 Subject: [PATCH 227/272] Update _utils.py --- unsloth/models/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index f5948cef1a..bf7d441c38 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1662,6 +1662,7 @@ def error_out_no_vllm(*args, **kwargs): pass except: AOBaseConfig = None + Int4WeightOnlyConfig = None pass @dataclass class TorchAOConfig: From c90df8745fab97c254902cd6f453341ee8d004c7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:40:44 -0700 Subject: [PATCH 228/272] Update llama.py --- unsloth/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 8ff74872a3..21b10f15cf 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1195,6 +1195,7 @@ def _CausalLM_fast_forward( logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(dtype)) else: RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1" + print("RETURN_LOGITS", RETURN_LOGITS) # < 1024 Normal Unsloth uses less VRAM! 
if bsz*q_len <= 1024: RETURN_LOGITS = True From c64f0113f2fe9bed8b7d3130435bba9eb7f6c4cf Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:47:41 -0700 Subject: [PATCH 229/272] Fix AMD --- unsloth/models/_utils.py | 4 ++-- unsloth/models/llama.py | 9 ++++++--- unsloth/models/mistral.py | 7 ++++++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index bf7d441c38..ab3a9e058e 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -458,8 +458,8 @@ def patch_mistral_nemo_config(config): config = re.sub( r"(\*\*kwargs)[\s]{0,}\,[\s]{0,}\)[\s]{0,}\:", r"rope_scaling=None,"\ - r"\n **kwargs):\n"\ - r"\n self.rope_scaling = rope_scaling\n", + r"\n \*\*kwargs):\n"\ + r"\n self\.rope_scaling = rope_scaling\n", config, ) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 21b10f15cf..75c4e14308 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1195,12 +1195,15 @@ def _CausalLM_fast_forward( logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(dtype)) else: RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1" - print("RETURN_LOGITS", RETURN_LOGITS) # < 1024 Normal Unsloth uses less VRAM! - if bsz*q_len <= 1024: RETURN_LOGITS = True + if DEVICE_TYPE == "hip": + # [TODO] AMD GPUs fail on chunked_cross_entropy loss! 
+ # RuntimeError: Triton Error [HIP]: Code: 1, Messsage: invalid argument + RETURN_LOGITS = False + elif bsz*q_len <= 1024: + RETURN_LOGITS = True if not RETURN_LOGITS and labels is not None: - n_items = kwargs.get("num_items_in_batch", None) if n_items is None: n_items = kwargs.get("n_items", None) diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index faab2d30b1..b547739df2 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -298,7 +298,12 @@ def MistralForCausalLM_fast_forward( else: RETURN_LOGITS = os.environ.get("UNSLOTH_RETURN_LOGITS", "0") == "1" # < 1024 Normal Unsloth uses less VRAM! - if bsz * q_len <= 1024: RETURN_LOGITS = True + if DEVICE_TYPE == "hip": + # [TODO] AMD GPUs fail on chunked_cross_entropy loss! + # RuntimeError: Triton Error [HIP]: Code: 1, Messsage: invalid argument + RETURN_LOGITS = False + elif bsz*q_len <= 1024: + RETURN_LOGITS = True if not RETURN_LOGITS and labels is not None: n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None) From 8eecf7d62e481b03e4f27b232c8d5eeb641a9973 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:49:48 -0700 Subject: [PATCH 230/272] Update _utils.py --- unsloth/models/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index ab3a9e058e..bf7d441c38 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -458,8 +458,8 @@ def patch_mistral_nemo_config(config): config = re.sub( r"(\*\*kwargs)[\s]{0,}\,[\s]{0,}\)[\s]{0,}\:", r"rope_scaling=None,"\ - r"\n \*\*kwargs):\n"\ - r"\n self\.rope_scaling = rope_scaling\n", + r"\n **kwargs):\n"\ + r"\n self.rope_scaling = rope_scaling\n", config, ) From c22b9a351993c89bf2b05f364fe476222c9f4d41 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:54:01 -0700 Subject: [PATCH 231/272] Update llama.py --- unsloth/models/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 75c4e14308..42c0eaf8ed 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1764,7 +1764,8 @@ def unsloth_fast_generate( kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id) # Mixed precision autocast - with torch.inference_mode(), torch.autocast(device_type = DEVICE_TYPE, dtype = dtype): + device_type = DEVICE_TYPE if DEVICE_TYPE != "hip" else "cuda" # hip doesn't work + with torch.inference_mode(), torch.autocast(device_type = device_type, dtype = dtype): output = self._old_generate(*args, **kwargs) pass From 38b9e00d81c119a858822c2a902f48744fbcfc14 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 05:56:24 -0700 Subject: [PATCH 232/272] Update vision.py --- unsloth/models/vision.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index b90ad00cf8..cfc1d0d082 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -203,11 +203,12 @@ def unsloth_base_fast_generate( except: pass # Mixed precision autocast + device_type = DEVICE_TYPE if DEVICE_TYPE != "hip" else "cuda" # hip doesn't work if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "1": - autocaster = torch.autocast(device_type = "cuda", dtype = torch.float16) + autocaster = torch.autocast(device_type = device_type, dtype = torch.float16) dtype = torch.float16 else: - autocaster = torch.autocast(device_type = "cuda", dtype = dtype) + autocaster = torch.autocast(device_type = device_type, dtype = dtype) # Prepare LoRA # state_dict = convert_lora_modules(self, dtype = dtype) From b99dcd5e469d1150bb0f930dc1414541574befcf Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 06:03:08 -0700 Subject: [PATCH 233/272] DEVICE_TYPE_TORCH --- unsloth/__init__.py | 2 ++ unsloth/models/_utils.py | 2 +- unsloth/models/llama.py | 33 ++++++++++++++++----------------- unsloth/models/vision.py | 7 +++---- 4 
files changed, 22 insertions(+), 22 deletions(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 5dd16bae99..45719d472c 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -96,6 +96,8 @@ def get_device_type(): raise NotImplementedError("Unsloth currently only works on NVIDIA, AMD and Intel GPUs.") pass DEVICE_TYPE : str = get_device_type() +# HIP fails for autocast and other torch functions. Use CUDA instead +DEVICE_TYPE_TORCH = DEVICE_TYPE if DEVICE_TYPE != "hip" else DEVICE_TYPE @functools.cache def get_device_count(): diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index bf7d441c38..e787f55532 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -87,7 +87,7 @@ import warnings, subprocess, re, inspect, psutil, os, math from unsloth_zoo.utils import Version from importlib.metadata import version as importlib_version -from unsloth import DEVICE_TYPE, DEVICE_COUNT +from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH from unsloth_zoo.log import logger from unsloth_zoo.tokenizer_utils import ( patch_tokenizer as _patch_tokenizer, diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 42c0eaf8ed..596042288d 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -27,7 +27,7 @@ from unsloth_zoo.utils import Version, _get_dtype from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs, fix_lora_auto_mapping from unsloth_zoo.peft_utils import SKIP_QUANTIZATION_MODULES -from unsloth import DEVICE_TYPE, DEVICE_COUNT +from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH transformers_version = Version(transformers_version) # Transformers moved rotary embeddings out of all attention layers @@ -732,7 +732,7 @@ def LlamaModel_fast_forward( position_ids = torch.arange( past_key_values_length, seq_length + past_key_values_length, dtype = torch.int32, - device = f"{DEVICE_TYPE}:0", + device = f"{DEVICE_TYPE_TORCH}:0", ) position_ids = 
position_ids.unsqueeze(0).view(-1, seq_length) elif position_ids is not None: @@ -905,13 +905,13 @@ def LlamaModel_fast_forward( is_causal = True, sliding_window = self.config.sliding_window, )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = DEVICE_TYPE,)\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = DEVICE_TYPE_TORCH,)\ .squeeze(0).squeeze(0) self.GA_mask = AttentionMaskConverter( is_causal = True, )\ - .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = DEVICE_TYPE,)\ + .to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = DEVICE_TYPE_TORCH,)\ .squeeze(0).squeeze(0) pass pass @@ -1028,11 +1028,11 @@ def LlamaModel_fast_forward_inference_custom( bsz, q_len, hd = X.shape assert(q_len == 1) # Get saved buffers to reduce memory movement - residual = torch.empty((bsz, q_len, hd), dtype = torch.float32, device = f"{DEVICE_TYPE}:0") - _XX = torch.empty((2, bsz, q_len, hd), dtype = torch.float32, device = f"{DEVICE_TYPE}:0") + residual = torch.empty((bsz, q_len, hd), dtype = torch.float32, device = f"{DEVICE_TYPE_TORCH}:0") + _XX = torch.empty((2, bsz, q_len, hd), dtype = torch.float32, device = f"{DEVICE_TYPE_TORCH}:0") XX, XX2 = _XX[0], _XX[1] - variance = torch.empty((bsz, q_len, 1), dtype = torch.float32, device = f"{DEVICE_TYPE}:0") - temp_mlp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = f"{DEVICE_TYPE}:0") + variance = torch.empty((bsz, q_len, 1), dtype = torch.float32, device = f"{DEVICE_TYPE_TORCH}:0") + temp_mlp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = f"{DEVICE_TYPE_TORCH}:0") temp_gates, temp_ups = tuple(temp_mlp[0].to(torch.device(x)) for x in range(DEVICE_COUNT)), tuple(temp_mlp[1].to(torch.device(x)) for x in range(DEVICE_COUNT)) seq_len = past_key_values[0][0].shape[-2] @@ -1378,7 +1378,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") 
else 1.0 dim = getattr(config, "head_dim", None) if dim is None: dim = int((config.hidden_size // config.num_attention_heads)) - device = DEVICE_TYPE + device = DEVICE_TYPE_TORCH max_position_embeddings = config.max_position_embeddings pass @@ -1490,7 +1490,7 @@ def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device= base = config.rope_theta partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 dim = int((config.hidden_size // config.num_attention_heads)) - device = DEVICE_TYPE + device = DEVICE_TYPE_TORCH max_position_embeddings = config.max_position_embeddings pass @@ -1610,7 +1610,7 @@ def __init__(self, base = config.rope_theta partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 dim = int((config.hidden_size // config.num_attention_heads)) - device = DEVICE_TYPE + device = DEVICE_TYPE_TORCH max_position_embeddings = config.max_position_embeddings pass @@ -1764,8 +1764,7 @@ def unsloth_fast_generate( kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id) # Mixed precision autocast - device_type = DEVICE_TYPE if DEVICE_TYPE != "hip" else "cuda" # hip doesn't work - with torch.inference_mode(), torch.autocast(device_type = device_type, dtype = dtype): + with torch.inference_mode(), torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = dtype): output = self._old_generate(*args, **kwargs) pass @@ -2389,7 +2388,7 @@ def get_peft_model( pass model.get_input_embeddings().modules_to_save.default\ - .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True) + .to(device = DEVICE_TYPE_TORCH, dtype = new_dtype, non_blocking = True) model.get_input_embeddings().modules_to_save.default.requires_grad_(True) # [TODO] Move old embed_tokens to CPU - should be disk! 
@@ -2409,7 +2408,7 @@ def get_peft_model( pass model.get_output_embeddings().modules_to_save.default\ - .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True) + .to(device = DEVICE_TYPE_TORCH, dtype = new_dtype, non_blocking = True) model.get_output_embeddings().modules_to_save.default.requires_grad_(True) # [TODO] Move old lm_head to CPU - should be disk! @@ -2678,7 +2677,7 @@ def get_peft_model( pass model.get_input_embeddings().modules_to_save.default\ - .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True) + .to(device = DEVICE_TYPE_TORCH, dtype = new_dtype, non_blocking = True) model.get_input_embeddings().modules_to_save.default.requires_grad_(True) pass @@ -2694,7 +2693,7 @@ def get_peft_model( pass model.get_output_embeddings().modules_to_save.default\ - .to(device = DEVICE_TYPE, dtype = new_dtype, non_blocking = True) + .to(device = DEVICE_TYPE_TORCH, dtype = new_dtype, non_blocking = True) model.get_output_embeddings().modules_to_save.default.requires_grad_(True) pass diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index cfc1d0d082..b2704876e3 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -71,7 +71,7 @@ # Old HF Hub versions <= 0.0.25 from huggingface_hub.utils._token import get_token pass -from unsloth import DEVICE_TYPE, DEVICE_COUNT +from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH __all__ = [ "FastBaseModel", @@ -203,12 +203,11 @@ def unsloth_base_fast_generate( except: pass # Mixed precision autocast - device_type = DEVICE_TYPE if DEVICE_TYPE != "hip" else "cuda" # hip doesn't work if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "1": - autocaster = torch.autocast(device_type = device_type, dtype = torch.float16) + autocaster = torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = torch.float16) dtype = torch.float16 else: - autocaster = torch.autocast(device_type = device_type, dtype = dtype) + autocaster = torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = 
dtype) # Prepare LoRA # state_dict = convert_lora_modules(self, dtype = dtype) From 19bc977f21fbb678d6bd961ca89cdbc00f66b90d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 06:07:00 -0700 Subject: [PATCH 234/272] Update __init__.py --- unsloth/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 45719d472c..d2b77f6b37 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -97,7 +97,8 @@ def get_device_type(): pass DEVICE_TYPE : str = get_device_type() # HIP fails for autocast and other torch functions. Use CUDA instead -DEVICE_TYPE_TORCH = DEVICE_TYPE if DEVICE_TYPE != "hip" else DEVICE_TYPE +DEVICE_TYPE_TORCH = DEVICE_TYPE +if DEVICE_TYPE_TORCH == "hip": DEVICE_TYPE_TORCH = "cuda" @functools.cache def get_device_count(): From 5aa6a39ad848ef901db2788cd8d32f333ba8b463 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 06:37:29 -0700 Subject: [PATCH 235/272] Update __init__.py --- unsloth/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unsloth/__init__.py b/unsloth/__init__.py index d2b77f6b37..99d651ae5f 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -149,7 +149,9 @@ def get_device_count(): # OutOfResources: out of resource: shared memory, Required: 98304, Hardware limit: 65536. 
Reducing block sizes or `num_stages` if (major_torch >= 2 and minor_torch >= 8) or (major_torch > 2): os.environ["UNSLOTH_ENABLE_CCE"] = "0" -pass +elif DEVICE_TYPE == "hip": + # CCE also fails in HIP / AMD + os.environ["UNSLOTH_ENABLE_CCE"] = "0" # Fix other issues import importlib.util From 0576c13e0ce1fa300ed08244c10b1b8bfad197f4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 21:36:05 -0700 Subject: [PATCH 236/272] Update _utils.py --- unsloth/models/_utils.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index beefada7bf..cb98db00f1 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1346,6 +1346,32 @@ def patch_gradient_accumulation_fix(Trainer): # Also fix passing in num_items_in_batch if not hasattr(Trainer, "_old_compute_loss"): + + # Fix transformers 4.57.0 causing `Output 0 of UnslothFusedLossBackward is a view and is being modified inplace.` + function = inspect.getsource(Trainer.compute_loss) + if "loss *=" in function or "loss*=" in function: + where = function.find("def") + function = function.split("\n") + function = "\n".join(x[where:] for x in function) + + # Import all variables that need importing + import transformers.trainer + items_in_trainer = dir(transformers.trainer) + good_items = [] + for item in items_in_trainer: + if item in function: good_items.append(item) + pass + exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) + + # Replace loss*= with loss = loss * + function = re.sub( + r"loss[\s]{0,}\*\=", + "loss = loss *", + function, + ) + exec(function, globals()) + Trainer.compute_loss = compute_loss + pass Trainer._old_compute_loss = Trainer.compute_loss Trainer.compute_loss = _unsloth_pre_compute_loss pass From ee46343c57d8c8628d3de401a8865cac83bfe416 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 23:26:06 -0700 Subject: [PATCH 237/272] Move DEVICE_TYPE 
--- unsloth/__init__.py | 56 +++++----------------- unsloth/device_type.py | 80 +++++++++++++++++++++++++++++++ unsloth/kernels/utils.py | 9 +++- unsloth/models/_utils.py | 10 +++- unsloth/models/llama.py | 9 +++- unsloth/models/loader.py | 33 ++++++++++++- unsloth/models/rl_replacements.py | 9 +++- unsloth/models/vision.py | 9 +++- 8 files changed, 165 insertions(+), 50 deletions(-) create mode 100644 unsloth/device_type.py diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 99d651ae5f..b336ad03fe 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -69,49 +69,14 @@ raise exception pass -@functools.cache -def is_hip(): - return bool(getattr(getattr(torch, "version", None), "hip", None)) -pass - -@functools.cache -def get_device_type(): - if hasattr(torch, "cuda") and torch.cuda.is_available(): - if is_hip(): - return "hip" - return "cuda" - elif hasattr(torch, "xpu") and torch.xpu.is_available(): - return "xpu" - # Check torch.accelerator - if hasattr(torch, "accelerator"): - if not torch.accelerator.is_available(): - raise NotImplementedError("Unsloth cannot find any torch accelerator? You need a GPU.") - accelerator = str(torch.accelerator.current_accelerator()) - if accelerator in ("cuda", "xpu", "hip"): - raise RuntimeError( - f"Unsloth: Weirdly `torch.cuda.is_available()`, `torch.xpu.is_available()` and `is_hip` all failed.\n"\ - f"But `torch.accelerator.current_accelerator()` works with it being = `{accelerator}`\n"\ - f"Please reinstall torch - it's most likely broken :(" - ) - raise NotImplementedError("Unsloth currently only works on NVIDIA, AMD and Intel GPUs.") -pass -DEVICE_TYPE : str = get_device_type() -# HIP fails for autocast and other torch functions. 
Use CUDA instead -DEVICE_TYPE_TORCH = DEVICE_TYPE -if DEVICE_TYPE_TORCH == "hip": DEVICE_TYPE_TORCH = "cuda" - -@functools.cache -def get_device_count(): - if DEVICE_TYPE in ("cuda", "hip"): - return torch.cuda.device_count() - elif DEVICE_TYPE == "xpu": - return torch.xpu.device_count() - else: - return 1 -pass - -DEVICE_COUNT : int = get_device_count() - +from .device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) # Reduce VRAM usage by reducing fragmentation # And optimize pinning of memory # TODO(billishyahao): need to add hip related optimization... @@ -201,7 +166,10 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16 else: from triton.common.build import libcuda_dirs # Try loading bitsandbytes and triton - import bitsandbytes as bnb + try: + import bitsandbytes as bnb + except: + print("Unsloth: `bitsandbytes` is not installed - 4bit QLoRA unallowed, but 16bit and full finetuning works!") try: cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32 libcuda_dirs() diff --git a/unsloth/device_type.py b/unsloth/device_type.py new file mode 100644 index 0000000000..547750019a --- /dev/null +++ b/unsloth/device_type.py @@ -0,0 +1,80 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__all__ = [ + "is_hip", + "get_device_type", + "DEVICE_TYPE", + "DEVICE_TYPE_TORCH", + "DEVICE_COUNT", + "ALLOW_PREQUANTIZED_MODELS", +] + +import torch +import functools + +@functools.cache +def is_hip(): + return bool(getattr(getattr(torch, "version", None), "hip", None)) +pass + +@functools.cache +def get_device_type(): + if hasattr(torch, "cuda") and torch.cuda.is_available(): + if is_hip(): + return "hip" + return "cuda" + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + return "xpu" + # Check torch.accelerator + if hasattr(torch, "accelerator"): + if not torch.accelerator.is_available(): + raise NotImplementedError("Unsloth cannot find any torch accelerator? You need a GPU.") + accelerator = str(torch.accelerator.current_accelerator()) + if accelerator in ("cuda", "xpu", "hip"): + raise RuntimeError( + f"Unsloth: Weirdly `torch.cuda.is_available()`, `torch.xpu.is_available()` and `is_hip` all failed.\n"\ + f"But `torch.accelerator.current_accelerator()` works with it being = `{accelerator}`\n"\ + f"Please reinstall torch - it's most likely broken :(" + ) + raise NotImplementedError("Unsloth currently only works on NVIDIA, AMD and Intel GPUs.") +pass +DEVICE_TYPE : str = get_device_type() +# HIP fails for autocast and other torch functions. 
Use CUDA instead +DEVICE_TYPE_TORCH = DEVICE_TYPE +if DEVICE_TYPE_TORCH == "hip": DEVICE_TYPE_TORCH = "cuda" + +@functools.cache +def get_device_count(): + if DEVICE_TYPE in ("cuda", "hip"): + return torch.cuda.device_count() + elif DEVICE_TYPE == "xpu": + return torch.xpu.device_count() + else: + return 1 +pass + +DEVICE_COUNT : int = get_device_count() + +# Check blocksize for 4bit -> 64 for CUDA, 128 for AMD +# If AMD, we cannot load pre-quantized models for now :( +ALLOW_PREQUANTIZED_MODELS : bool = True +if DEVICE_TYPE == "hip": + try: + from bitsandbytes.nn.modules import Params4bit + if "blocksize = 64 if not HIP_ENVIRONMENT else 128" in inspect.getsource(Params4bit): + ALLOW_PREQUANTIZED_MODELS = False + except: + pass +pass diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py index 9a46d0d5d7..16fc694230 100644 --- a/unsloth/kernels/utils.py +++ b/unsloth/kernels/utils.py @@ -19,7 +19,14 @@ import functools from typing import Optional -from .. import DEVICE_TYPE, DEVICE_COUNT +from ..device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) from .fp8 import weight_dequant, fp8_linear # torch.cuda.amp.custom_fwd is deprecated >= 2.4 diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index cb98db00f1..35c0a2fcd5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -87,7 +87,14 @@ import warnings, subprocess, re, inspect, psutil, os, math from unsloth_zoo.utils import Version from importlib.metadata import version as importlib_version -from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH +from ..device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) from unsloth_zoo.log import logger from unsloth_zoo.tokenizer_utils import ( patch_tokenizer as _patch_tokenizer, @@ -1331,6 +1338,7 @@ def _unsloth_pre_compute_loss(self, model, inputs, *args, **kwargs): 
def patch_gradient_accumulation_fix(Trainer): # Fixes gradient accumulation + # Fixes Output 0 of UnslothFusedLossBackward is a view and is being modified inplace. import inspect if hasattr(Trainer, "get_batch_samples"): if Trainer.get_batch_samples.__name__ == "_unsloth_get_batch_samples": return diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 596042288d..535537a3a1 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -27,7 +27,14 @@ from unsloth_zoo.utils import Version, _get_dtype from unsloth_zoo.hf_utils import dtype_from_config, add_dtype_kwargs, fix_lora_auto_mapping from unsloth_zoo.peft_utils import SKIP_QUANTIZATION_MODULES -from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH +from ..device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) transformers_version = Version(transformers_version) # Transformers moved rotary embeddings out of all attention layers diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 356bca8a29..ecafc5b2ce 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -45,6 +45,14 @@ pass from huggingface_hub import HfFileSystem import importlib.util +from ...device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) # https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading! 
from unsloth_zoo.utils import Version, _get_dtype @@ -195,6 +203,12 @@ def from_pretrained( old_model_name = model_name if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) + # Check if pre-quantized models are allowed + # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.removesuffix("-unsloth-bnb-4bit") + model_name = model_name.removesuffix("-bnb-4bit") + pass if USE_MODELSCOPE and not os.path.exists(model_name): from modelscope import snapshot_download @@ -306,6 +320,12 @@ def from_pretrained( model_name = peft_config.base_model_name_or_path if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) + # Check if pre-quantized models are allowed + # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.removesuffix("-unsloth-bnb-4bit") + model_name = model_name.removesuffix("-bnb-4bit") + pass model_config = AutoConfig.from_pretrained( model_name, token = token, @@ -618,6 +638,12 @@ def from_pretrained( old_model_name = model_name if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) + # Check if pre-quantized models are allowed + # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.removesuffix("-unsloth-bnb-4bit") + model_name = model_name.removesuffix("-bnb-4bit") + pass # Check modelscope if USE_MODELSCOPE and not os.path.exists(model_name): @@ -833,7 +859,12 @@ def from_pretrained( model_name = peft_config.base_model_name_or_path if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) - + # Check if pre-quantized 
models are allowed + # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.removesuffix("-unsloth-bnb-4bit") + model_name = model_name.removesuffix("-bnb-4bit") + pass model_config = AutoConfig.from_pretrained( model_name, token = token, diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py index a207514e72..1e68790004 100644 --- a/unsloth/models/rl_replacements.py +++ b/unsloth/models/rl_replacements.py @@ -26,7 +26,14 @@ import inspect from collections import defaultdict from unsloth_zoo.rl_replacements import RL_REPLACEMENTS, left_pack_padding -from unsloth import DEVICE_TYPE +from .device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) import textwrap RL_EXTRA_ARGS = defaultdict(list) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index b2704876e3..f2bd7c306b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -71,7 +71,14 @@ # Old HF Hub versions <= 0.0.25 from huggingface_hub.utils._token import get_token pass -from unsloth import DEVICE_TYPE, DEVICE_COUNT, DEVICE_TYPE_TORCH +from ..device_type import ( + is_hip, + get_device_type, + DEVICE_TYPE, + DEVICE_TYPE_TORCH, + DEVICE_COUNT, + ALLOW_PREQUANTIZED_MODELS, +) __all__ = [ "FastBaseModel", From 09fd92546b09ec2107144f73cf8a27de183162b8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 23:27:53 -0700 Subject: [PATCH 238/272] Update rl_replacements.py --- unsloth/models/rl_replacements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py index 1e68790004..7ecffdf63d 100644 --- a/unsloth/models/rl_replacements.py +++ b/unsloth/models/rl_replacements.py @@ -26,7 +26,7 @@ import inspect from collections import defaultdict from 
unsloth_zoo.rl_replacements import RL_REPLACEMENTS, left_pack_padding -from .device_type import ( +from ..device_type import ( is_hip, get_device_type, DEVICE_TYPE, From bd22cb1a7c5d5b5d7d13422d8af181d2093f56d4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 16 Oct 2025 23:30:14 -0700 Subject: [PATCH 239/272] Update loader.py --- unsloth/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index ecafc5b2ce..5d1896fd5b 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -45,7 +45,7 @@ pass from huggingface_hub import HfFileSystem import importlib.util -from ...device_type import ( +from ..device_type import ( is_hip, get_device_type, DEVICE_TYPE, From 9fe4d319fd44ec36103bd2d26da54c69f768efe2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 03:32:42 -0700 Subject: [PATCH 240/272] AMD install script --- pyproject.toml | 4 ++-- unsloth/models/_amd_install.sh | 31 +++++++++++++++++++++++++++++++ unsloth/models/_utils.py | 2 +- 3 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 unsloth/models/_amd_install.sh diff --git a/pyproject.toml b/pyproject.toml index 2f812c769f..b490114e12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ triton = [ "triton-windows ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", ] huggingface = [ - "unsloth_zoo>=2025.10.4", + "unsloth_zoo>=2025.10.5", "wheel>=0.42.0", "packaging", "torchvision", @@ -458,7 +458,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.4", + "unsloth_zoo>=2025.10.5", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", diff --git a/unsloth/models/_amd_install.sh b/unsloth/models/_amd_install.sh new file mode 100644 index 0000000000..637fdba7de --- /dev/null +++ 
b/unsloth/models/_amd_install.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# _amd_install.sh +# Non-interactive installer: build tools, PyTorch (ROCm 6.4), bitsandbytes (HIP), and Unsloth from source. +# Usage: +# bash _amd_install.sh +# + +set -euo pipefail +export DEBIAN_FRONTEND=noninteractive + +apt-get update +apt-get install -y --no-install-recommends build-essential cmake git + +pip install \ + torch==2.8.0 torchvision torchaudio torchao==0.13.0 xformers \ + --index-url https://download.pytorch.org/whl/rocm6.4 + +WORKDIR="$(pwd)" +TMPDIR="$(mktemp -d)" +cd "$TMPDIR" +git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git +cd bitsandbytes +arch +cmake -DCOMPUTE_BACKEND=hip -S . +make -j"$(nproc)" +pip install . +cd "$WORKDIR" +rm -rf "$TMPDIR" + +pip install "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" +pip install "unsloth[base] @ git+https://github.com/unslothai/unsloth" diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 35c0a2fcd5..e713c8efff 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.10.4" +__version__ = "2025.10.5" __all__ = [ "SUPPORTS_BFLOAT16", From 302649864f1bd47768975937c7c005b8dbe715bc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 03:34:06 -0700 Subject: [PATCH 241/272] Move AMD --- unsloth/{models => }/_amd_install.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename unsloth/{models => }/_amd_install.sh (100%) diff --git a/unsloth/models/_amd_install.sh b/unsloth/_amd_install.sh similarity index 100% rename from unsloth/models/_amd_install.sh rename to unsloth/_amd_install.sh From c8150dcefafe3b7753cb47ed1750622573381ba8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 03:36:42 -0700 Subject: [PATCH 242/272] Update _amd_install.sh --- unsloth/_amd_install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/_amd_install.sh b/unsloth/_amd_install.sh index 637fdba7de..b83dbd2ad0 100644 --- a/unsloth/_amd_install.sh +++ b/unsloth/_amd_install.sh @@ -27,5 +27,6 @@ pip install . cd "$WORKDIR" rm -rf "$TMPDIR" +pip install --no-deps unsloth unsloth-zoo pip install "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" pip install "unsloth[base] @ git+https://github.com/unslothai/unsloth" From f25c3650cb15b3acdce8086034691b9f9366cde4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 04:00:20 -0700 Subject: [PATCH 243/272] Update pyproject.toml --- pyproject.toml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b490114e12..6409d74c05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,11 +39,10 @@ triton = [ "triton>=3.0.0 ; ('linux' in sys_platform)", "triton-windows ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", ] -huggingface = [ +huggingfacenotorch = [ "unsloth_zoo>=2025.10.5", "wheel>=0.42.0", "packaging", - "torchvision", "numpy", "tqdm", "psutil", @@ -58,6 +57,10 @@ huggingface = [ "diffusers", 
"transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,!=0.19.0,<=0.23.0", +] +huggingface = [ + "unsloth[huggingfacenotorch]", + "torchvision", "unsloth[triton]", ] windows = [ @@ -740,7 +743,12 @@ intel-gpu-torch270 = [ "torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp312-cp312-linux_x86_64.whl#sha256=c806d44aa2ca5d225629f6fbc6c994d5deaac2d2cde449195bc8e3522ddd219a ; ('linux' in sys_platform) and python_version == '3.12' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", "torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp313-cp313-linux_x86_64.whl#sha256=25d8277b7f01d42e2e014ccbab57a2692b6ec4eff8dcf894eda1b297407cf97a ; ('linux' in sys_platform) and python_version == '3.13' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", ] - +amd = [ + "unsloth[huggingfacenotorch]", + "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl ; ('linux' in sys_platform) and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", + "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-win_amd64.whl ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", + "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl ; ('linux' in sys_platform) and (platform_machine == 'aarch64')", +] [project.urls] homepage = "http://www.unsloth.ai" documentation = "https://github.com/unslothai/unsloth" From 315c2cfcc3e554b73a5744736e0c59f32263b20f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 06:53:45 -0700 Subject: [PATCH 244/272] 
Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6409d74c05..57fab8baf3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,6 @@ triton = [ "triton-windows ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')", ] huggingfacenotorch = [ - "unsloth_zoo>=2025.10.5", "wheel>=0.42.0", "packaging", "numpy", @@ -60,6 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", + "unsloth_zoo>=2025.10.5", "torchvision", "unsloth[triton]", ] From b5d3df84541421054b695aec6340f03d26036866 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 06:53:55 -0700 Subject: [PATCH 245/272] Delete _amd_install.sh --- unsloth/_amd_install.sh | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 unsloth/_amd_install.sh diff --git a/unsloth/_amd_install.sh b/unsloth/_amd_install.sh deleted file mode 100644 index b83dbd2ad0..0000000000 --- a/unsloth/_amd_install.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash -# _amd_install.sh -# Non-interactive installer: build tools, PyTorch (ROCm 6.4), bitsandbytes (HIP), and Unsloth from source. -# Usage: -# bash _amd_install.sh -# - -set -euo pipefail -export DEBIAN_FRONTEND=noninteractive - -apt-get update -apt-get install -y --no-install-recommends build-essential cmake git - -pip install \ - torch==2.8.0 torchvision torchaudio torchao==0.13.0 xformers \ - --index-url https://download.pytorch.org/whl/rocm6.4 - -WORKDIR="$(pwd)" -TMPDIR="$(mktemp -d)" -cd "$TMPDIR" -git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git -cd bitsandbytes -arch -cmake -DCOMPUTE_BACKEND=hip -S . -make -j"$(nproc)" -pip install . 
-cd "$WORKDIR" -rm -rf "$TMPDIR" - -pip install --no-deps unsloth unsloth-zoo -pip install "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" -pip install "unsloth[base] @ git+https://github.com/unslothai/unsloth" From 55dd1f66a86c08f938e189bb8da2f14c151e6975 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 06:54:35 -0700 Subject: [PATCH 246/272] Update device_type.py --- unsloth/device_type.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unsloth/device_type.py b/unsloth/device_type.py index 547750019a..ac70d26795 100644 --- a/unsloth/device_type.py +++ b/unsloth/device_type.py @@ -19,10 +19,13 @@ "DEVICE_TYPE_TORCH", "DEVICE_COUNT", "ALLOW_PREQUANTIZED_MODELS", + "ALLOW_BITSANDBYTES", ] import torch import functools +from unsloth_zoo.utils import Version +import inspect @functools.cache def is_hip(): @@ -70,11 +73,15 @@ def get_device_count(): # Check blocksize for 4bit -> 64 for CUDA, 128 for AMD # If AMD, we cannot load pre-quantized models for now :( ALLOW_PREQUANTIZED_MODELS : bool = True +# HSA_STATUS_ERROR_EXCEPTION checks - sometimes AMD fails for BnB +ALLOW_BITSANDBYTES : bool = True if DEVICE_TYPE == "hip": try: from bitsandbytes.nn.modules import Params4bit if "blocksize = 64 if not HIP_ENVIRONMENT else 128" in inspect.getsource(Params4bit): ALLOW_PREQUANTIZED_MODELS = False + import bitsandbytes + ALLOW_BITSANDBYTES = Version(bitsandbytes.__version__) > Version("0.48.2.dev0") except: pass pass From 0960fe45a38d06d91864d374ca3f062394fcfd85 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 17 Oct 2025 06:54:53 -0700 Subject: [PATCH 247/272] Update loader.py --- unsloth/models/loader.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 5d1896fd5b..4c6c91f6fc 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -52,6 +52,7 @@ DEVICE_TYPE_TORCH, DEVICE_COUNT, ALLOW_PREQUANTIZED_MODELS, + ALLOW_BITSANDBYTES, ) # 
https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading! @@ -199,6 +200,10 @@ def from_pretrained( ) pass pass + # Check if 4bit is allowed specifically for AMD + if not ALLOW_BITSANDBYTES and not use_exact_model_name: + print("Unsloth: AMD currently is not stable with 4bit bitsandbytes. Disabling for now.") + load_in_4bit = False old_model_name = model_name if not use_exact_model_name: @@ -634,6 +639,10 @@ def from_pretrained( "compatible with `full_finetuning=True`. If you wish to use QAT with LoRA, " "please pass in `qat_scheme` in `FastLanguageModel.get_peft_model(...)` instead." ) + # Check if 4bit is allowed specifically for AMD + if not ALLOW_BITSANDBYTES and not use_exact_model_name: + print("Unsloth: AMD currently is not stable with 4bit bitsandbytes. Disabling for now.") + load_in_4bit = False old_model_name = model_name if not use_exact_model_name: From 5f0d9fa7540e7627141d043427ea36550c2f4026 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 22:20:38 -0700 Subject: [PATCH 248/272] Update _utils.py --- unsloth/models/_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 78ca68b449..1f50b8206e 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -163,7 +163,15 @@ warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "multiprocessing") warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "multiprocess") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "triton") - +try: + # pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True + # was provided to the `Field()` function, which has no effect in the context it was used. + # 'frozen' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. 
+ # This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type. + from pydantic.warnings import UnsupportedFieldAttributeWarning + warnings.filterwarnings(action = "ignore", category = UnsupportedFieldAttributeWarning, module = "pydantic") +except: + pass # Stop "Special tokens have been added in the vocabulary, ..." import logging logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.CRITICAL+1) From 1f726a42943af972aa4c13642cffd401c11cc495 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 22:26:12 -0700 Subject: [PATCH 249/272] Update _utils.py --- unsloth/models/_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 1f50b8206e..fab02563a6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -352,6 +352,14 @@ def filter(self, x): return not (self.text in x.getMessage()) except: pass +# We detected that you are using `from_pretrained` with a meta device context manager or `torch.set_default_device('meta') +try: + from transformers.modeling_utils import logger as modeling_utils_logger + modeling_utils_logger.addFilter(HideLoggingMessage("anti-pattern")) + del modeling_utils_logger +except: + pass + # Errors out on # Some weights of Gemma3nForConditionalGeneration were not initialized from the model checkpoint from transformers.modeling_utils import logger as transformers_logger From 8d29c64257bc1be5f54977f96590cdb5a45708be Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 22:44:16 -0700 Subject: [PATCH 250/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index fab02563a6..c7284c5e96 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -169,7 +169,7 @@ # 'frozen' is field-specific 
metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. # This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type. from pydantic.warnings import UnsupportedFieldAttributeWarning - warnings.filterwarnings(action = "ignore", category = UnsupportedFieldAttributeWarning, module = "pydantic") + warnings.filterwarnings(action = "ignore", category = UnsupportedFieldAttributeWarning) except: pass # Stop "Special tokens have been added in the vocabulary, ..." From 0e9fb1da1e10a902ffba06776e2026b8435a3fb3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 22:49:03 -0700 Subject: [PATCH 251/272] Update _utils.py --- unsloth/models/_utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index c7284c5e96..337413aca4 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -163,15 +163,6 @@ warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "multiprocessing") warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "multiprocess") warnings.filterwarnings(action = "ignore", category = UserWarning, module = "triton") -try: - # pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True - # was provided to the `Field()` function, which has no effect in the context it was used. - # 'frozen' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. - # This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type. 
- from pydantic.warnings import UnsupportedFieldAttributeWarning - warnings.filterwarnings(action = "ignore", category = UnsupportedFieldAttributeWarning) -except: - pass # Stop "Special tokens have been added in the vocabulary, ..." import logging logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.CRITICAL+1) From 9950e27d8d107efba09109225c702d2b6c33ea75 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 22:56:01 -0700 Subject: [PATCH 252/272] Update _utils.py --- unsloth/models/_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 337413aca4..b38fb57c4d 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -214,6 +214,12 @@ def filter(self, x): return not (self.text in x.getMessage()) del vllm_lora_model_logger except: pass + try: + from vllm.attention.utils.fa_utils import logger as vllm_attention_utils_fa_utils_logger + vllm_attention_utils_fa_utils_logger.addFilter(HideLoggingMessage("Cannot use FA version")) + del vllm_attention_utils_fa_utils_logger + except: + pass pass # The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here. From d995f71d86f2bb3a25473d46ac0543e3dc306713 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 23:15:26 -0700 Subject: [PATCH 253/272] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 067f2596c6..b35c1326ca 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -345,7 +345,23 @@ def fix_sentencepiece_tokenizer( ): # From https://github.com/google/sentencepiece/issues/121 # We need to manually edit the sentencepiece tokenizer! 
- from transformers.utils import sentencepiece_model_pb2 + try: + from transformers.convert_slow_tokenizer import import_protobuf + sentencepiece_model_pb2 = import_protobuf() + except Exception as e: + try: + import google.protobuf + from unsloth_zoo.utils import Version + protobuf_version = Version(google.protobuf.__version__) + if protobuf_version > Version("3.20.3"): + raise RuntimeError( + f"Unsloth: Your protobuf version = {protobuf_version} is too new.\n"\ + f"Please downgrade via `pip install --force-reinstall protobuf==3.20.3`" + ) + except: + # This will only work for older SentencePiece versions <= 3.20.3 + from transformers.utils import sentencepiece_model_pb2 + pass if not os.path.exists(temporary_location): os.makedirs(temporary_location) From c4db81bc773329ee5488b325e69c5232497338f6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 23:19:04 -0700 Subject: [PATCH 254/272] Versioning --- pyproject.toml | 4 ++-- unsloth/models/_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fb63f89ae6..624054caed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", - "unsloth_zoo>=2025.10.6", + "unsloth_zoo>=2025.10.7", "torchvision", "unsloth[triton]", ] @@ -461,7 +461,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.6", + "unsloth_zoo>=2025.10.7", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index b38fb57c4d..22ce2fd9e5 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2025.10.6" +__version__ = "2025.10.7" __all__ = [ "SUPPORTS_BFLOAT16", From ea37dd68e2a1c7e222ceeb41b7cf11e80ee1e9ef Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 19 Oct 2025 23:19:33 -0700 Subject: [PATCH 255/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 624054caed..1d46c8824b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", - "unsloth_zoo>=2025.10.7", + "unsloth_zoo>=2025.10.8", "torchvision", "unsloth[triton]", ] @@ -461,7 +461,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.7", + "unsloth_zoo>=2025.10.8", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", From 5ff72340553952847773cd5a94a49e2ae9107a8c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 01:39:56 -0700 Subject: [PATCH 256/272] Update loader.py --- unsloth/models/loader.py | 50 +++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 4c6c91f6fc..165b7e0551 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -210,10 +210,14 @@ def from_pretrained( model_name = get_model_name(model_name, load_in_4bit) # Check if pre-quantized models are allowed # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 - if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): - model_name = model_name.removesuffix("-unsloth-bnb-4bit") - model_name = model_name.removesuffix("-bnb-4bit") - pass + if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") + 
model_name = model_name.lower().removesuffix("-bnb-4bit") + # Change -BF16 to all False for 4bit, 8bit etc + if model_name.lower().endswith("-BF16"): + load_in_4bit = False + load_in_8bit = False + load_in_16bit = True if USE_MODELSCOPE and not os.path.exists(model_name): from modelscope import snapshot_download @@ -327,10 +331,15 @@ def from_pretrained( model_name = get_model_name(model_name, load_in_4bit) # Check if pre-quantized models are allowed # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 - if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): - model_name = model_name.removesuffix("-unsloth-bnb-4bit") - model_name = model_name.removesuffix("-bnb-4bit") - pass + if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") + model_name = model_name.lower().removesuffix("-bnb-4bit") + # Change -BF16 to all False for 4bit, 8bit etc + if model_name.lower().endswith("-BF16"): + load_in_4bit = False + load_in_8bit = False + load_in_16bit = True + model_config = AutoConfig.from_pretrained( model_name, token = token, @@ -649,10 +658,14 @@ def from_pretrained( model_name = get_model_name(model_name, load_in_4bit) # Check if pre-quantized models are allowed # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 - if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): - model_name = model_name.removesuffix("-unsloth-bnb-4bit") - model_name = model_name.removesuffix("-bnb-4bit") - pass + if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") + model_name = model_name.lower().removesuffix("-bnb-4bit") + # Change -BF16 to all False for 4bit, 8bit etc + if model_name.lower().endswith("-BF16"): + load_in_4bit = False + 
load_in_8bit = False + load_in_16bit = True # Check modelscope if USE_MODELSCOPE and not os.path.exists(model_name): @@ -870,10 +883,15 @@ def from_pretrained( model_name = get_model_name(model_name, load_in_4bit) # Check if pre-quantized models are allowed # For eg AMD GPUs need blocksize = 128, but our pre-quants are blocksize = 64 - if not ALLOW_PREQUANTIZED_MODELS and model_name.endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): - model_name = model_name.removesuffix("-unsloth-bnb-4bit") - model_name = model_name.removesuffix("-bnb-4bit") - pass + if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(("-unsloth-bnb-4bit", "-bnb-4bit")): + model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") + model_name = model_name.lower().removesuffix("-bnb-4bit") + # Change -BF16 to all False for 4bit, 8bit etc + if model_name.lower().endswith("-BF16"): + load_in_4bit = False + load_in_8bit = False + load_in_16bit = True + model_config = AutoConfig.from_pretrained( model_name, token = token, From 47c2dd6eeb3f8f044f4759701d6e14d09ce743e7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 02:22:33 -0700 Subject: [PATCH 257/272] Update _utils.py --- unsloth/models/_utils.py | 155 ++++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 68 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 22ce2fd9e5..0f5dc50450 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -905,6 +905,25 @@ def prepare_model_for_kbit_training( pass # ============================================= +import importlib +global USE_MODELSCOPE +USE_MODELSCOPE = os.environ.get("UNSLOTH_USE_MODELSCOPE", "0") == "1" +if USE_MODELSCOPE: + if importlib.util.find_spec("modelscope") is None: + raise ImportError(f'You are using the modelscope hub, please install modelscope by `pip install modelscope -U`') + pass +pass + +import socket +def has_internet(host = "8.8.8.8", port = 53, timeout = 3): + if 
os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1": return False + try: + socket.setdefaulttimeout(timeout) + socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port)) + return True + except socket.error as ex: + return False +pass import psutil def _get_statistics(statistics = None, force_download = True): @@ -912,56 +931,71 @@ def _get_statistics(statistics = None, force_download = True): # We simply download a README.md file from HF - all data is made public. # This is simply so we can check if some envs are broken or not. # You can disable this by commenting the below out - try: - n_cpus = psutil.cpu_count(logical = False) - keynames = "\n" + "\n".join(os.environ.keys()) - if statistics is not None: pass - elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab" - elif "\nCOLAB_" in keynames: statistics = "colabpro" - elif "\nKAGGLE_" in keynames: statistics = "kaggle" - elif "\nRUNPOD_" in keynames: statistics = "runpod" - elif "\nAWS_" in keynames: statistics = "aws" - elif "\nAZURE_" in keynames: statistics = "azure" - # elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" - elif "\nINVOCATION_ID" in keynames: statistics = "lambda" - # else: statistics = "other" - else: - def try_vllm_check(): - vendor_files = ( - "/sys/class/dmi/id/product_version", - "/sys/class/dmi/id/bios_vendor", - "/sys/class/dmi/id/product_name", - "/sys/class/dmi/id/chassis_asset_tag", - "/sys/class/dmi/id/sys_vendor", - ) - from pathlib import Path - for vendor_file in vendor_files: - path = Path(vendor_file) - if path.is_file(): - file_content = path.read_text().lower() - if "amazon" in file_content: return "aws" - elif "microsoft corporation" in file_content: return "azure" - elif "google" in file_content: return "gcp" - return "other" - pass - try: statistics = try_vllm_check() - except: statistics = "other" - pass - if statistics is not None: - from transformers import AutoModelForCausalLM - stats_model = 
AutoModelForCausalLM.from_pretrained( - f"unslothai/{statistics}", - force_download = force_download, + n_cpus = psutil.cpu_count(logical = False) + keynames = "\n" + "\n".join(os.environ.keys()) + if statistics is not None: pass + # Check modelscope for down detection + global USE_MODELSCOPE + USE_MODELSCOPE = os.environ.get("UNSLOTH_USE_MODELSCOPE", "0") == "1" + elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab" + elif "\nCOLAB_" in keynames: statistics = "colabpro" + elif "\nKAGGLE_" in keynames: statistics = "kaggle" + elif "\nRUNPOD_" in keynames: statistics = "runpod" + elif "\nAWS_" in keynames: statistics = "aws" + elif "\nAZURE_" in keynames: statistics = "azure" + # elif "\nK_" in keynames or "\nFUNCTION_" in keynames: statistics = "gcp" + elif "\nINVOCATION_ID" in keynames: statistics = "lambda" + # else: statistics = "other" + else: + def try_vllm_check(): + vendor_files = ( + "/sys/class/dmi/id/product_version", + "/sys/class/dmi/id/bios_vendor", + "/sys/class/dmi/id/product_name", + "/sys/class/dmi/id/chassis_asset_tag", + "/sys/class/dmi/id/sys_vendor", ) - del stats_model + from pathlib import Path + for vendor_file in vendor_files: + path = Path(vendor_file) + if path.is_file(): + file_content = path.read_text().lower() + if "amazon" in file_content: return "aws" + elif "microsoft corporation" in file_content: return "azure" + elif "google" in file_content: return "gcp" + return "other" pass - except: + try: statistics = try_vllm_check() + except: statistics = "other" + pass + if statistics is not None: + import tempfile + from huggingface_hub import snapshot_download + from unsloth_zoo.rl_environments import execute_with_time_limit + if has_internet(): + @execute_with_time_limit(60) + def stats_check(): + with tempfile.TemporaryDirectory(ignore_cleanup_errors = True) as f: + snapshot_download(statistics, force_download = True, cache_dir = f, local_dir = f) + try: + stats_check() + except TimeoutError: + raise TimeoutError( + 
"Unsloth: HuggingFace seems to be down :( Check https://status.huggingface.co/\n"\ + "As a temporary measure, use modelscope ie:\n"\ + "pip install modelscope\n"\ + "import os; os.environ['UNSLOTH_USE_MODELSCOPE'] = '1'\n"\ + "from unsloth import FastLanguageModel\n"\ + "model = FastLanguageModel.from_pretrained(...)" + ) pass + pass pass def get_statistics(): # We log some basic stats about which environment is being used. + # This is also to check if HuggingFace is down or not! # We simply download a README.md file from HF - all data is made public. # This is simply so we can check if some envs are broken or not. # You can disable this by setting UNSLOTH_DISABLE_STATISTICS @@ -975,24 +1009,17 @@ def get_statistics(): pass _get_statistics(None) _get_statistics("repeat", force_download = False) - try: - vram = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024 - if vram <= 8 : vram = 8 - elif vram <= 16: vram = 16 - elif vram <= 20: vram = 20 - elif vram <= 24: vram = 24 - elif vram <= 40: vram = 40 - elif vram <= 48: vram = 48 - elif vram <= 80: vram = 80 - else: vram = 96 - _get_statistics(f"vram-{vram}") - except: - pass - pass - try: - _get_statistics(f"{DEVICE_COUNT if DEVICE_COUNT <= 8 else 9}") - except: - pass + vram = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024 + if vram <= 8 : vram = 8 + elif vram <= 16: vram = 16 + elif vram <= 20: vram = 20 + elif vram <= 24: vram = 24 + elif vram <= 40: vram = 40 + elif vram <= 48: vram = 48 + elif vram <= 80: vram = 80 + else: vram = 96 + _get_statistics(f"vram-{vram}") + _get_statistics(f"{DEVICE_COUNT if DEVICE_COUNT <= 8 else 9}") if disabled: enable_progress_bars() pass @@ -1592,14 +1619,6 @@ def __str__ (self): return LOGITS_ERROR_STRING except: continue pass -import importlib -USE_MODELSCOPE = os.environ.get("UNSLOTH_USE_MODELSCOPE", "0") == "1" -if USE_MODELSCOPE: - if importlib.util.find_spec("modelscope") is None: - raise ImportError(f'You are using the modelscope 
hub, please install modelscope by `pip install modelscope -U`') - pass -pass - def validate_loftq_config(loftq_config, lora_dropout, bias, init_lora_weights, model): from peft import LoraConfig From ead800eadfc9e02674fd2e89779c12005669006b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 02:55:54 -0700 Subject: [PATCH 258/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1d46c8824b..624054caed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", - "unsloth_zoo>=2025.10.8", + "unsloth_zoo>=2025.10.7", "torchvision", "unsloth[triton]", ] @@ -461,7 +461,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.8", + "unsloth_zoo>=2025.10.7", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", From 2dc242099b1b125100683f483bc23ebd87dc62dc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 02:57:34 -0700 Subject: [PATCH 259/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 624054caed..fb63f89ae6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", - "unsloth_zoo>=2025.10.7", + "unsloth_zoo>=2025.10.6", "torchvision", "unsloth[triton]", ] @@ -461,7 +461,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.7", + "unsloth_zoo>=2025.10.6", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", From eba9bb35cc1b47cb7741e081b07cc0143536dc1c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 
02:58:40 -0700 Subject: [PATCH 260/272] Update _utils.py --- unsloth/models/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 0f5dc50450..4f613f1f2c 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -933,10 +933,11 @@ def _get_statistics(statistics = None, force_download = True): # You can disable this by commenting the below out n_cpus = psutil.cpu_count(logical = False) keynames = "\n" + "\n".join(os.environ.keys()) - if statistics is not None: pass # Check modelscope for down detection global USE_MODELSCOPE USE_MODELSCOPE = os.environ.get("UNSLOTH_USE_MODELSCOPE", "0") == "1" + + if statistics is not None: pass elif "\nCOLAB_" in keynames and n_cpus == 1: statistics = "colab" elif "\nCOLAB_" in keynames: statistics = "colabpro" elif "\nKAGGLE_" in keynames: statistics = "kaggle" From 3c3765cfde9036dc3b919c0afd0dd6e8744cb97c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 02:59:15 -0700 Subject: [PATCH 261/272] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fb63f89ae6..1d46c8824b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ huggingfacenotorch = [ ] huggingface = [ "unsloth[huggingfacenotorch]", - "unsloth_zoo>=2025.10.6", + "unsloth_zoo>=2025.10.8", "torchvision", "unsloth[triton]", ] @@ -461,7 +461,7 @@ colab-ampere-torch220 = [ "flash-attn>=2.6.3 ; ('linux' in sys_platform)", ] colab-new = [ - "unsloth_zoo>=2025.10.6", + "unsloth_zoo>=2025.10.8", "packaging", "tyro", "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2", From 367d6dcc05f9e77ce6e0d2b560178d84151b7521 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:02:51 -0700 Subject: [PATCH 262/272] Update _utils.py --- unsloth/models/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 4f613f1f2c..547ccfd428 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -915,6 +915,7 @@ def prepare_model_for_kbit_training( pass import socket +@functools.lru_cache(1) def has_internet(host = "8.8.8.8", port = 53, timeout = 3): if os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1": return False try: @@ -974,7 +975,7 @@ def try_vllm_check(): from huggingface_hub import snapshot_download from unsloth_zoo.rl_environments import execute_with_time_limit if has_internet(): - @execute_with_time_limit(60) + @execute_with_time_limit(120) def stats_check(): with tempfile.TemporaryDirectory(ignore_cleanup_errors = True) as f: snapshot_download(statistics, force_download = True, cache_dir = f, local_dir = f) From ca3f6882cba744fc7ee1704716bb0dd1dda72fbe Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:04:50 -0700 Subject: [PATCH 263/272] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 547ccfd428..353378bc06 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -978,7 +978,7 @@ def try_vllm_check(): @execute_with_time_limit(120) def stats_check(): with tempfile.TemporaryDirectory(ignore_cleanup_errors = True) as f: - snapshot_download(statistics, force_download = True, cache_dir = f, local_dir = f) + snapshot_download(f"unslothai/{statistics}", force_download = True, cache_dir = f, local_dir = f) try: stats_check() except TimeoutError: From f6dd92b7088a063c0a104fe3a071797ef226b342 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:10:20 -0700 Subject: [PATCH 264/272] Update loader.py --- unsloth/models/loader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 165b7e0551..e17fb58a46 100644 --- a/unsloth/models/loader.py +++ 
b/unsloth/models/loader.py @@ -214,7 +214,7 @@ def from_pretrained( model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") model_name = model_name.lower().removesuffix("-bnb-4bit") # Change -BF16 to all False for 4bit, 8bit etc - if model_name.lower().endswith("-BF16"): + if model_name.lower().endswith("-bf16"): load_in_4bit = False load_in_8bit = False load_in_16bit = True @@ -335,7 +335,7 @@ def from_pretrained( model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") model_name = model_name.lower().removesuffix("-bnb-4bit") # Change -BF16 to all False for 4bit, 8bit etc - if model_name.lower().endswith("-BF16"): + if model_name.lower().endswith("-bf16"): load_in_4bit = False load_in_8bit = False load_in_16bit = True @@ -662,7 +662,7 @@ def from_pretrained( model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") model_name = model_name.lower().removesuffix("-bnb-4bit") # Change -BF16 to all False for 4bit, 8bit etc - if model_name.lower().endswith("-BF16"): + if model_name.lower().endswith("-bf16"): load_in_4bit = False load_in_8bit = False load_in_16bit = True @@ -887,7 +887,7 @@ def from_pretrained( model_name = model_name.lower().removesuffix("-unsloth-bnb-4bit") model_name = model_name.lower().removesuffix("-bnb-4bit") # Change -BF16 to all False for 4bit, 8bit etc - if model_name.lower().endswith("-BF16"): + if model_name.lower().endswith("-bf16"): load_in_4bit = False load_in_8bit = False load_in_16bit = True From 1393bd896cc413f949fcf232979d195a9dac7376 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:12:06 -0700 Subject: [PATCH 265/272] Update _utils.py --- unsloth/models/_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 353378bc06..9084437eb4 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -984,11 +984,13 @@ def stats_check(): except TimeoutError: raise TimeoutError( "Unsloth: HuggingFace seems 
to be down :( Check https://status.huggingface.co/\n"\ - "As a temporary measure, use modelscope ie:\n"\ + "As a temporary measure, use modelscope with the same model name ie:\n"\ + "```\n"\ "pip install modelscope\n"\ "import os; os.environ['UNSLOTH_USE_MODELSCOPE'] = '1'\n"\ "from unsloth import FastLanguageModel\n"\ - "model = FastLanguageModel.from_pretrained(...)" + "model = FastLanguageModel.from_pretrained('unsloth/gpt-oss-20b')\n"\ + "```" ) pass pass From 0da81296c7d7b15d2b5bd3b3a079720d768b92ca Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:12:43 -0700 Subject: [PATCH 266/272] Update _utils.py --- unsloth/models/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 9084437eb4..5c959fe443 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -983,7 +983,8 @@ def stats_check(): stats_check() except TimeoutError: raise TimeoutError( - "Unsloth: HuggingFace seems to be down :( Check https://status.huggingface.co/\n"\ + "Unsloth: HuggingFace seems to be down after trying for 120 seconds :(\n"\ + "Check https://status.huggingface.co/ for more details.\n"\ "As a temporary measure, use modelscope with the same model name ie:\n"\ "```\n"\ "pip install modelscope\n"\ From 0a2ce91c75ca2c97fe1874cea70ce337cf2aa175 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 03:56:13 -0700 Subject: [PATCH 267/272] local_files_only --- unsloth/models/_utils.py | 3 ++- unsloth/models/llama.py | 3 ++- unsloth/models/vision.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 5c959fe443..f2d2e9ee70 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -998,7 +998,7 @@ def stats_check(): pass -def get_statistics(): +def get_statistics(local_files_only = False): # We log some basic stats about which environment is being used. 
# This is also to check if HuggingFace is down or not! # We simply download a README.md file from HF - all data is made public. @@ -1006,6 +1006,7 @@ def get_statistics(): # You can disable this by setting UNSLOTH_DISABLE_STATISTICS import os if "UNSLOTH_DISABLE_STATISTICS" in os.environ: return + if local_files_only: return from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled disabled = False if not are_progress_bars_disabled(): diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 535537a3a1..d94699756b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1922,7 +1922,8 @@ def from_pretrained( if old_hf_transfer != "0": os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" model_patcher.pre_patch() - get_statistics() # For debugging - we use a download counter to see if environments are not breaking + # For debugging - we use a download counter to see if environments are not breaking or if HF is down + get_statistics(kwargs.get("local_files_only", False)) if dtype is None: dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index f2bd7c306b..d6322d77f2 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -416,7 +416,8 @@ def from_pretrained( pass if old_hf_transfer != "0": os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" - get_statistics() # For debugging - we use a download counter to see if environments are not breaking + # For debugging - we use a download counter to see if environments are not breaking or if HF is down + get_statistics(kwargs.get("local_files_only", False)) if dtype is None: dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16 From c9f5c1aa6f06bbce9ff736253cb7428648b61602 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 04:00:21 -0700 Subject: [PATCH 268/272] Cut Cross Entropy --- unsloth/models/llama.py | 50 
++++++++++++++++++++--------------------- unsloth/save.py | 12 +++++----- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index d94699756b..972d603cf9 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1205,7 +1205,7 @@ def _CausalLM_fast_forward( # < 1024 Normal Unsloth uses less VRAM! if DEVICE_TYPE == "hip": # [TODO] AMD GPUs fail on chunked_cross_entropy loss! - # RuntimeError: Triton Error [HIP]: Code: 1, Messsage: invalid argument + # RuntimeError: Triton Error [HIP]: Code: 1, Messsage: invalid argument RETURN_LOGITS = False elif bsz*q_len <= 1024: RETURN_LOGITS = True @@ -1217,36 +1217,36 @@ def _CausalLM_fast_forward( if self.config.model_type == "falcon_h1": hidden_states = hidden_states * self.config.lm_head_multiplier - # loss = fused_linear_cross_entropy( - # hidden_states = hidden_states, - # lm_weight = lm_head, - # labels = labels, - # num_items_in_batch = n_items, - # logit_softcapping = logit_softcapping, - # ) - loss = unsloth_fused_ce_loss( - trainer = None, - hidden_states = hidden_states, - lm_head_weight = lm_head, - lm_head_bias = None, - labels = labels, - mask = None, - n_items = n_items, - scaling = getattr(self, "accelerator_scaler", None), - target_gb = None, - torch_compile = True, - logit_softcapping = logit_softcapping, + loss = fused_linear_cross_entropy( + hidden_states = hidden_states, + lm_weight = lm_head, + labels = labels, + num_items_in_batch = n_items, + logit_softcapping = logit_softcapping, ) + # loss = unsloth_fused_ce_loss( + # trainer = None, + # hidden_states = hidden_states, + # lm_head_weight = lm_head, + # lm_head_bias = None, + # labels = labels, + # mask = None, + # n_items = n_items, + # scaling = getattr(self, "accelerator_scaler", None), + # target_gb = None, + # torch_compile = True, + # logit_softcapping = logit_softcapping, + # ) if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None 
else output output = CausalLMOutputWithPast( - loss=loss, - logits=EMPTY_LOGITS, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, + loss = loss, + logits = EMPTY_LOGITS, + past_key_values= outputs.past_key_values, + hidden_states = outputs.hidden_states, + attentions = outputs.attentions, ) return output pass diff --git a/unsloth/save.py b/unsloth/save.py index 506c8a68f1..a62d63fc86 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -2565,10 +2565,10 @@ def unsloth_save_pretrained_torchao( """ # first merge the lora weights arguments = dict(locals()) - arguments["model"] = self - arguments["tokenizer"] = tokenizer - arguments["push_to_hub"] = False # We save ourselves - arguments["save_method"] = "merged_16bit" # Must be 16bit + arguments["model"] = self + arguments["tokenizer"] = tokenizer + arguments["push_to_hub"] = False # We save ourselves + arguments["save_method"] = "merged_16bit" # Must be 16bit del arguments["self"] del arguments["torchao_config"] @@ -2722,7 +2722,7 @@ def patch_saving_functions(model, vision = False): model.save_pretrained_merged = types.MethodType(unsloth_generic_save_pretrained_merged, model) model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) - model.save_pretrained_torchao = types.MethodType(unsloth_save_pretrained_torchao, model) + model.save_pretrained_torchao = types.MethodType(unsloth_save_pretrained_torchao, model) model.push_to_hub_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model) model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model) pass @@ -2732,7 +2732,7 @@ def patch_saving_functions(model, vision = False): model.save_pretrained_merged = types.MethodType(unsloth_generic_save_pretrained_merged, model) model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) 
model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) - model.save_pretrained_torchao = types.MethodType(unsloth_save_pretrained_torchao, model) + model.save_pretrained_torchao = types.MethodType(unsloth_save_pretrained_torchao, model) pass return model pass From 76135417fe4867a54f8c87365e90c93489d95142 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 20 Oct 2025 04:23:20 -0700 Subject: [PATCH 269/272] Update llama.py --- unsloth/models/llama.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 972d603cf9..d154dfe20a 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1217,26 +1217,28 @@ def _CausalLM_fast_forward( if self.config.model_type == "falcon_h1": hidden_states = hidden_states * self.config.lm_head_multiplier - loss = fused_linear_cross_entropy( - hidden_states = hidden_states, - lm_weight = lm_head, - labels = labels, - num_items_in_batch = n_items, - logit_softcapping = logit_softcapping, - ) - # loss = unsloth_fused_ce_loss( - # trainer = None, - # hidden_states = hidden_states, - # lm_head_weight = lm_head, - # lm_head_bias = None, - # labels = labels, - # mask = None, - # n_items = n_items, - # scaling = getattr(self, "accelerator_scaler", None), - # target_gb = None, - # torch_compile = True, - # logit_softcapping = logit_softcapping, + ### DISABLED since T4 breaks + # OutOfResources: out of resource: shared memory, Required: 98304, Hardware limit: 65536. Reducing block sizes or `num_stages` may help. 
+ # loss = fused_linear_cross_entropy( + # hidden_states = hidden_states, + # lm_weight = lm_head, + # labels = labels, + # num_items_in_batch = n_items, + # logit_softcapping = logit_softcapping, # ) + loss = unsloth_fused_ce_loss( + trainer = None, + hidden_states = hidden_states, + lm_head_weight = lm_head, + lm_head_bias = None, + labels = labels, + mask = None, + n_items = n_items, + scaling = getattr(self, "accelerator_scaler", None), + target_gb = None, + torch_compile = True, + logit_softcapping = logit_softcapping, + ) if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output From 09657816afe995710745954e7dd79843f56ff9ad Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 30 Oct 2025 06:33:01 -0700 Subject: [PATCH 270/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 95c6e65317..a564ad1058 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -218,7 +218,7 @@ def unsloth_base_fast_generate( dtype = torch.float16 else: autocaster = torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = dtype) - + print(dtype, autocaster) # Prepare LoRA # state_dict = convert_lora_modules(self, dtype = dtype) From 6bf04ce034c7e6f9843a5bdd453617529d0d4484 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 30 Oct 2025 06:36:53 -0700 Subject: [PATCH 271/272] Update vision.py --- unsloth/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index a564ad1058..87a9adf53a 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -218,7 +218,6 @@ def unsloth_base_fast_generate( dtype = torch.float16 else: autocaster = torch.autocast(device_type = DEVICE_TYPE_TORCH, dtype = dtype) - print(dtype, autocaster) # Prepare LoRA # state_dict = convert_lora_modules(self, dtype = dtype) @@ -278,6 +277,7 
@@ def unsloth_base_fast_generate( pass # DO INFERENCE + print(args, kwargs) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) From 7ea715eaf5b1ed4211630e33f154bb086136197f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 30 Oct 2025 06:42:20 -0700 Subject: [PATCH 272/272] Update vision.py --- unsloth/models/vision.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 87a9adf53a..65ba9ccf9b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -156,9 +156,9 @@ def unsloth_base_fast_generate( FastBaseModel.for_inference(self) dtype = _get_dtype(dtype_from_config(self.config)) - # Handle float32 cases - if os.environ.get("UNSLOTH_BFLOAT16_MIXED_PRECISION", "0") == "1": - dtype = torch.bfloat16 + # Handle full float32 cases as config.dtype == torch.float32! + do_bfloat16_mixed_precision = os.environ.get("UNSLOTH_BFLOAT16_MIXED_PRECISION", "0") == "1" + if do_bfloat16_mixed_precision: dtype = torch.bfloat16 # Check if VLM is_vlm = any( @@ -254,6 +254,8 @@ def unsloth_base_fast_generate( cache_implementation = "hybrid" else: cache_implementation = "static" + # [TODO] Unsure why static fails + if do_bfloat16_mixed_precision: cache_implementation = None if "generation_config" in kwargs: kwargs["generation_config"].cache_implementation = cache_implementation @@ -277,7 +279,6 @@ def unsloth_base_fast_generate( pass # DO INFERENCE - print(args, kwargs) with torch.inference_mode(), autocaster: output = self._old_generate(*args, **kwargs) @@ -525,7 +526,7 @@ def from_pretrained( f"To enable bfloat16 training to reduce VRAM usage by 50% albeit with a slightly higher loss, do:\n"\ "use `float32_mixed_precision = False` during FastLanguageModel.from_pretrained" ) - os.environ["UNSLOTH_BFLOAT16_MIXED_PRECISION"] = "1" + os.environ["UNSLOTH_BFLOAT16_MIXED_PRECISION"] = "1" else: print("Unsloth: Float16 full finetuning uses more memory since 
we upcast weights to float32.") else: