diff --git a/pyproject.toml b/pyproject.toml
index 8e18688ddf..e563ba6fc5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@
 name = "unsloth"
 dynamic = ["version"]
 description = "2-5X faster LLM finetuning"
 readme = "README.md"
-requires-python = ">=3.9,<3.13"
+requires-python = ">=3.9,<=3.13"
 license = {text = "Apache-2.0"}
 keywords = ["ai", "llm",]
 authors = [
@@ -37,7 +37,7 @@ triton = [
 ]
 huggingface = [
-    "unsloth_zoo>=2025.8.3",
+    "unsloth_zoo>=2025.8.4",
     "packaging",
     "tyro",
     "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0",
@@ -384,7 +384,7 @@ colab-ampere-torch220 = [
     "flash-attn>=2.6.3",
 ]
 colab-new = [
-    "unsloth_zoo>=2025.8.3",
+    "unsloth_zoo>=2025.8.4",
     "packaging",
     "tyro",
     "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0",
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 4426a28266..d1df57ad5c 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "2025.8.4"
+__version__ = "2025.8.5"
 
 __all__ = [
     "SUPPORTS_BFLOAT16",
@@ -58,6 +58,7 @@
     "HAS_CUT_CROSS_ENTROPY",
     "EMPTY_LOGITS",
     "fused_linear_cross_entropy",
+    "unsloth_fused_ce_loss",
 
     "patch_unsloth_smart_gradient_checkpointing",
     "unpatch_unsloth_smart_gradient_checkpointing",
@@ -109,6 +110,7 @@
     HAS_CUT_CROSS_ENTROPY,
     fused_linear_cross_entropy,
     _unsloth_get_batch_samples,
+    unsloth_fused_ce_loss,
 )
 from unsloth_zoo.vision_utils import (
     process_vision_info,
@@ -152,6 +154,41 @@ def __init__(self, text): self.text = text
     def filter(self, x): return not (self.text in x.getMessage())
 pass
 
+# Stop vLLM messages
+if os.environ.get('UNSLOTH_ENABLE_LOGGING', '0') != '1':
+    try:
+        from vllm.worker.worker import logger as vllm_worker_logger
+        vllm_worker_logger.addFilter(HideLoggingMessage("Sleep mode freed"))
+        del vllm_worker_logger
+    except:
+        pass
+    try:
+        from vllm.v1.worker.gpu_worker import logger as vllm_gpu_worker_logger
+        vllm_gpu_worker_logger.addFilter(HideLoggingMessage("Sleep mode freed"))
+        del vllm_gpu_worker_logger
+    except:
+        pass
+    try:
+        from vllm.executor.executor_base import logger as vllm_executor_logger
+        vllm_executor_logger.addFilter(HideLoggingMessage("to fall asleep"))
+        vllm_executor_logger.addFilter(HideLoggingMessage("to wake up"))
+        del vllm_executor_logger
+    except:
+        pass
+    try:
+        from vllm.core.block.prefix_caching_block import logger as vllm_prefix_caching_logger
+        vllm_prefix_caching_logger.addFilter(HideLoggingMessage("reset prefix cache"))
+        del vllm_prefix_caching_logger
+    except:
+        pass
+    try:
+        from vllm.v1.core.block_pool import logger as vllm_block_pool_logger
+        vllm_block_pool_logger.addFilter(HideLoggingMessage("reset prefix cache"))
+        del vllm_block_pool_logger
+    except:
+        pass
+pass
+
 # The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here.
 from transformers.training_args import logger as transformers_training_args_logger
 transformers_training_args_logger.addFilter(HideLoggingMessage("The speedups"))
@@ -224,6 +261,17 @@ def filter(self, x): return not (self.text in x.getMessage())
 except:
     pass
 
+# You passed `quantization_config` or equivalent parameters
+try:
+    warnings.filterwarnings(
+        action   = "ignore",
+        message  = r".*quantization_config.*",
+        category = UserWarning,
+        append   = True,
+    )
+except:
+    pass
+
 # Errors out on
 # Some weights of Gemma3nForConditionalGeneration were not initialized from the model checkpoint
 from transformers.modeling_utils import logger as transformers_logger
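Note: the vLLM hunk above uses the file's existing `HideLoggingMessage` pattern, which relies on the standard `logging.Filter` protocol: when `filter()` returns False, the record is dropped from that logger. A minimal standalone sketch of the pattern; the `demo` logger is illustrative, not one of the patched vLLM loggers:

    import logging

    class HideLoggingMessage(logging.Filter):
        """Drop any record whose formatted message contains `text`."""
        def __init__(self, text):
            super().__init__()
            self.text = text
        def filter(self, record):
            return self.text not in record.getMessage()

    logging.basicConfig(level = logging.INFO)
    demo = logging.getLogger("demo")
    demo.addFilter(HideLoggingMessage("reset prefix cache"))
    demo.info("Successfully reset prefix cache")  # dropped by the filter
    demo.info("Scheduler started")                # still emitted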
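The `quantization_config` hunk suppresses a `UserWarning` instead, since that message comes from Python's `warnings` machinery rather than a logger; the `message` argument of `filterwarnings` is a regex matched against the warning text. A small sketch with made-up warning strings:

    import warnings

    warnings.filterwarnings(
        action   = "ignore",
        message  = r".*quantization_config.*",
        category = UserWarning,
        append   = True,
    )
    warnings.warn("You passed `quantization_config` twice", UserWarning)  # silenced
    warnings.warn("Unrelated warning", UserWarning)                       # still shown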
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 3c0d5012ae..eafbd5a433 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -1197,12 +1197,25 @@ def _CausalLM_fast_forward(
         if self.config.model_type == "falcon_h1":
             hidden_states = hidden_states * self.config.lm_head_multiplier
 
-        loss = fused_linear_cross_entropy(
-            hidden_states      = hidden_states,
-            lm_weight          = lm_head,
-            labels             = labels,
-            num_items_in_batch = n_items,
-            logit_softcapping  = logit_softcapping,
+        # loss = fused_linear_cross_entropy(
+        #     hidden_states      = hidden_states,
+        #     lm_weight          = lm_head,
+        #     labels             = labels,
+        #     num_items_in_batch = n_items,
+        #     logit_softcapping  = logit_softcapping,
+        # )
+        loss = unsloth_fused_ce_loss(
+            trainer           = None,
+            hidden_states     = hidden_states,
+            lm_head_weight    = lm_head,
+            lm_head_bias      = None,
+            labels            = labels,
+            mask              = None,
+            n_items           = n_items,
+            scaling           = getattr(self, "accelerator_scaler", None),
+            target_gb         = 1,
+            torch_compile     = True,
+            logit_softcapping = logit_softcapping,
         )
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index ea746be43d..15f3e43aef 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -111,6 +111,14 @@ def from_pretrained(
         disable_log_stats = True,
         *args, **kwargs,
     ):
+        # Login to allow private models
+        if token is None: token = get_token()
+        if token is not None:
+            try:
+                from huggingface_hub import login
+                login(token = token)
+            except:
+                pass
         if load_in_8bit or full_finetuning:
             return FastModel.from_pretrained(
                 model_name = model_name,
@@ -513,6 +521,13 @@ def from_pretrained(
         *args, **kwargs,
     ):
         if token is None: token = get_token()
+        # Login to allow private models
+        if token is not None:
+            try:
+                from huggingface_hub import login
+                login(token = token)
+            except:
+                pass
         if whisper_language is not None: assert(type(whisper_language) is str)
         if whisper_task is not None: assert(type(whisper_task) is str)
 
         SUPPORTS_BFLOAT16 = is_bfloat16_supported()
@@ -587,10 +602,12 @@ def from_pretrained(
             if transformers_version < Version("4.53.0"):
                 raise RuntimeError("Unsloth: Gemma 3N only works on transformers >= 4.53.0" + LATEST)
         elif "falcon-h1" in lowered_model_name:
+            # Falcon must use float32 Triton, i.e. TRITON_F32_DEFAULT = 'ieee',
+            # since the Mamba kernels error out on lower precision
             os.environ["UNSLOTH_FORCE_CUSTOM_DTYPE"] = \
                 "float16;torch.float32;torch.float16;"\
-                "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16); "\
-                "os.environ['TRITON_F32_DEFAULT'] = 'ieee';"
+                "if name.endswith(('q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'head')): module.to(torch.float16);"\
+                "os.environ['TRITON_F32_DEFAULT'] = 'ieee'"
         elif "gpt-oss" in lowered_model_name:
             os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # CCE fails on Tesla T4
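In the llama.py hunk, the old `fused_linear_cross_entropy` call is kept as a comment and replaced by `unsloth_zoo`'s `unsloth_fused_ce_loss`, which takes the `lm_head` weight and hidden states directly so the full `[tokens, vocab]` logit matrix never needs to be materialized at once (`target_gb` bounds the working set, and `scaling` receives the accelerate grad scaler set up in the rl.py hunk below). A hedged sketch of the chunking idea only, not the actual unsloth_zoo implementation; `chunked_ce_loss` and its `chunk_size` are illustrative:

    import torch
    import torch.nn.functional as F

    def chunked_ce_loss(hidden_states, lm_head_weight, labels, chunk_size = 1024):
        # hidden_states:  [tokens, hidden]
        # lm_head_weight: [vocab, hidden]
        # labels:         [tokens], -100 marks ignored positions
        total   = hidden_states.new_zeros(())
        n_valid = (labels != -100).sum().clamp(min = 1)
        for i in range(0, hidden_states.shape[0], chunk_size):
            # Only a [chunk_size, vocab] slice of logits exists at any time
            logits = hidden_states[i : i + chunk_size] @ lm_head_weight.T
            total  = total + F.cross_entropy(
                logits.float(),
                labels[i : i + chunk_size],
                ignore_index = -100,
                reduction    = "sum",
            )
        return total / n_valid

A real fused implementation additionally recomputes each chunk's logits during backward (e.g. via a custom `torch.autograd.Function`); otherwise autograd retains every chunk and the memory saving is lost in training.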
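The two loader.py hunks add a best-effort Hugging Face login before anything is downloaded, so gated and private checkpoints resolve with the user's cached credentials. The standalone equivalent, assuming `huggingface_hub`'s public `get_token`/`login` helpers:

    from huggingface_hub import get_token, login

    token = get_token()  # cached token from `huggingface-cli login` or HF_TOKEN
    if token is not None:
        try:
            login(token = token)
        except Exception:
            pass  # best effort: a stale token should not abort model loading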
os.environ["UNSLOTH_DISABLE_STATIC_GENERATION"] = "1" # CCE fails on Tesla T4 diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index deb779588c..e751ef5e30 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -421,6 +421,20 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): RLTrainer_post += neftune_check pass + # Add accelerator scaler to model + if "model" in call_args: + neftune_check = \ + "if hasattr(self, 'accelerator'):\n"\ + " scaler = self.accelerator.scaler\n"\ + " current_model = model\n"\ + " while hasattr(current_model, 'model'):\n"\ + " current_model.accelerator_scaler = scaler\n"\ + " current_model = current_model.model\n"\ + " current_model.accelerator_scaler = scaler\n"\ + "pass\n" + RLTrainer_post += neftune_check + pass + # Edit optional metrics other_metrics_processor = "" if trainer_file in RL_METRICS_CHANGES: