diff --git a/pyproject.toml b/pyproject.toml
index 7dfca63faa..5b9dc8bb57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,7 +40,7 @@ triton = [
 ]
 huggingface = [
-    "unsloth_zoo>=2025.3.7",
+    "unsloth_zoo>=2025.3.8",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",
@@ -354,7 +354,7 @@ colab-ampere-torch220 = [
     "flash-attn>=2.6.3",
 ]
 colab-new = [
-    "unsloth_zoo>=2025.3.7",
+    "unsloth_zoo>=2025.3.8",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",
diff --git a/unsloth/__init__.py b/unsloth/__init__.py
index 38453f3614..5bbb85d520 100644
--- a/unsloth/__init__.py
+++ b/unsloth/__init__.py
@@ -198,7 +198,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
 # Check for unsloth_zoo
 try:
     unsloth_zoo_version = importlib_version("unsloth_zoo")
-    if Version(unsloth_zoo_version) < Version("2025.3.7"):
+    if Version(unsloth_zoo_version) < Version("2025.3.8"):
         try:
             os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo")
         except:
diff --git a/unsloth/models/__init__.py b/unsloth/models/__init__.py
index a187ee577a..317525c793 100644
--- a/unsloth/models/__init__.py
+++ b/unsloth/models/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from .llama import FastLlamaModel
-from .loader import FastLanguageModel, FastVisionModel
+from .loader import FastLanguageModel, FastVisionModel, FastTextModel, FastModel
 from .mistral import FastMistralModel
 from .qwen2 import FastQwen2Model
 from .granite import FastGraniteModel
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 37c69ef877..03eb21f4eb 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "2025.3.8"
+__version__ = "2025.3.9"
 
 __all__ = [
     "SUPPORTS_BFLOAT16",
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index a490fb8ab4..3504037b66 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -91,7 +91,7 @@ def original_apply_o(self, X):
 pass
 
 from math import sqrt as math_sqrt
-KV_CACHE_INCREMENT = 256 # KV Cache update size
+KV_CACHE_INCREMENT = 512 # KV Cache update size
 torch_nn_functional_softmax = torch.nn.functional.softmax
 # SDPA has GQA internally
 SDPA_HAS_GQA = "enable_gqa" in scaled_dot_product_attention.__doc__
@@ -1656,6 +1656,13 @@ def from_pretrained(
                 "Are you certain you want to do remote code execution?"
             )
         pass
+        if fast_inference:
+            import platform
+            if platform.system().lower() == 'windows':
+                print("Unsloth: vLLM does not work in Windows! Will use Unsloth inference!")
+                fast_inference = False
+        pass
+
         if token is None: token = get_token()
         if model_patcher is None: model_patcher = FastLlamaModel
         SUPPORTS_BFLOAT16 = is_bfloat16_supported()
@@ -1966,12 +1973,17 @@ def from_pretrained(
         for layer in model.model.layers:
             layer.self_attn.rotary_emb = rotary_emb
         pass
-
+
+        # Add for_inference and for_training
+        model.for_training = functools.partial(FastLlamaModel.for_training, model)
+        model.for_inference = functools.partial(FastLlamaModel.for_inference, model)
+
         # Patch generate
         if model.generate.__name__ != "unsloth_fast_generate":
             model._old_generate = model.generate
             unsloth_fast_generate.__doc__ = model._old_generate.__doc__
             model.generate = types.MethodType(unsloth_fast_generate, model)
+        pass
         return model, tokenizer
     pass
@@ -2404,7 +2416,7 @@ def get_peft_model(
         # Add for_inference and for_training
         model.for_training = functools.partial(FastLlamaModel.for_training, model)
         model.for_inference = functools.partial(FastLlamaModel.for_inference, model)
-
+
         # Patch generate
         if model.generate.__name__ != "unsloth_fast_generate":
             model._old_generate = model.generate
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index 30128cd134..800c016cc8 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -383,10 +383,13 @@ def from_pretrained(
     patch_loss_functions,
     post_patch_loss_function,
 )
-from .vision import FastBaseVisionModel
-
+from .vision import FastBaseModel
+from transformers import (
+    AutoModelForVision2Seq,
+    AutoModelForCausalLM,
+)
 
-class FastVisionModel(FastBaseVisionModel):
+class FastModel(FastBaseModel):
     @staticmethod
     def from_pretrained(
         model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
@@ -413,7 +416,7 @@ def from_pretrained(
         patch_compiling_bitsandbytes()
         if use_gradient_checkpointing == "unsloth":
             patch_unsloth_smart_gradient_checkpointing(dtype = dtype)
-
+
         old_model_name = model_name
         if not use_exact_model_name:
             model_name = get_model_name(model_name, load_in_4bit)
@@ -427,7 +430,7 @@ def from_pretrained(
         from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled
         was_disabled = are_progress_bars_disabled()
         disable_progress_bars()
-
+
         autoconfig_error = None
         peft_error = None
         try:
@@ -458,7 +461,7 @@ def from_pretrained(
 
         # Old transformers versions check
         both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32
-
+
         # New transformers need to check manually.
         if SUPPORTS_LLAMA32:
             # Check if folder exists locally
@@ -515,9 +518,12 @@ def from_pretrained(
         if not was_disabled: enable_progress_bars()
 
         do_logging = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
-        redirector = sys.stdout if do_logging else open(os.devnull, "w")
+        if do_logging:
+            redirector = contextlib.nullcontext()
+        else:
+            redirector = contextlib.redirect_stdout(open(os.devnull, "w"))
 
-        with contextlib.redirect_stdout(redirector):
+        with redirector:
             patch_loss_functions(torch_compile = False)
             model_types = unsloth_compile_transformers(
                 model_name = model_name,
@@ -547,7 +553,6 @@ def from_pretrained(
                 return_logits = return_logits,
             )
         pass
-        if do_logging: redirector.close()
 
         # Check if this is local model since the tokenizer gets overwritten
         if os.path.exists(os.path.join(old_model_name, "tokenizer_config.json")) and \
@@ -559,7 +564,12 @@ def from_pretrained(
             tokenizer_name = None
         pass
 
-        model, tokenizer = FastBaseVisionModel.from_pretrained(
+        # Check if VLM
+        is_vlm = any(x.endswith("ForConditionalGeneration") for x in model_config.architectures)
+        is_vlm = is_vlm or hasattr(model_config, "vision_config")
+        auto_model = AutoModelForVision2Seq if is_vlm else AutoModelForCausalLM
+
+        model, tokenizer = FastBaseModel.from_pretrained(
             model_name = model_name,
             max_seq_length = max_seq_length,
             dtype = _get_dtype(dtype),
@@ -570,6 +580,7 @@ def from_pretrained(
             revision = revision if not is_peft else None,
             model_types = model_types,
             tokenizer_name = tokenizer_name,
+            auto_model = auto_model,
             *args, **kwargs,
         )
 
@@ -617,8 +628,14 @@ def from_pretrained(
                 trust_remote_code = trust_remote_code,
             )
             # Patch it as well!
-            model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing)
+            model = FastBaseModel.patch_peft_model(model, use_gradient_checkpointing)
         pass
         return model, tokenizer
     pass
 pass
+
+class FastVisionModel(FastModel):
+    pass
+
+class FastTextModel(FastModel):
+    pass
diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py
index da7f449bb4..a2e609f203 100644
--- a/unsloth/models/mapper.py
+++ b/unsloth/models/mapper.py
@@ -611,6 +611,21 @@
         "open-thoughts/OpenThinker-7B",
         "unsloth/OpenThinker-7B-bnb-4bit",
     ),
+    "unsloth/granite-3.2-2b-instruct-unsloth-bnb-4bit" : (
+        "unsloth/granite-3.2-2b-instruct",
+        "ibm-granite/granite-3.2-2b-instruct",
+        "unsloth/granite-3.2-2b-instruct-bnb-4bit",
+    ),
+    "unsloth/granite-3.2-8b-instruct-unsloth-bnb-4bit" : (
+        "unsloth/granite-3.2-8b-instruct",
+        "ibm-granite/granite-3.2-8b-instruct",
+        "unsloth/granite-3.2-8b-instruct-bnb-4bit",
+    ),
+    "unsloth/QwQ-32B-unsloth-bnb-4bit" : (
+        "unsloth/QwQ-32B",
+        "Qwen/QwQ-32B",
+        "unsloth/QwQ-32B-bnb-4bit",
+    ),
 }
 
 INT_TO_FLOAT_MAPPER = {}
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index d13d394669..ff07ef6917 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -17,6 +17,8 @@
     BitsAndBytesConfig,
     AutoModelForVision2Seq,
     AutoProcessor,
+    AutoTokenizer,
+    AutoModelForCausalLM,
 )
 from .llama import *
 from ..kernels import (
@@ -31,48 +33,60 @@
     requires_grad_for_gradient_checkpointing,
 )
 from triton import __version__ as triton_version
+from unsloth_zoo.utils import _get_dtype
+from unsloth_zoo.patching_utils import patch_model_and_tokenizer
+import types
+import functools
 
 __all__ = [
-    "FastBaseVisionModel",
+    "FastBaseModel",
 ]
 
-def _wrap_fast_inference(generate, device_type, dtype, model):
-    # Wraps inference with bfloat16 / float16
-    @torch.inference_mode
-    def _fast_generate(*args, **kwargs):
-        # For num_logits_to_keep
-        # kwargs["num_logits_to_keep"] = 1
-
-        # Remove token_type_ids
-        kwargs.pop("token_type_ids", None)
+def unsloth_base_fast_generate(
+    self,
+    *args,
+    **kwargs,
+):
+    FastBaseModel.for_inference(self)
+    dtype = _get_dtype(self.config.torch_dtype)
 
-        # Check pad_token
-        model_eos_token_id = getattr(model.config, "eos_token_id", None)
-        if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"):
-            model_eos_token_id = model_eos_token_id[0]
+    # Check if VLM
+    is_vlm = any(x.endswith("ForConditionalGeneration") for x in self.config.architectures)
+    is_vlm = is_vlm or hasattr(self.config, "vision_config")
 
-        kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id)
+    # Remove token_type_ids
+    kwargs.pop("token_type_ids", None)
 
-        try:
-            kwargs["pixel_values"] = kwargs["pixel_values"].to(model.dtype)
-        except:
-            pass
+    # VLMs do not allow logits_to_keep
+    if not is_vlm: kwargs["logits_to_keep"] = 1
 
-        # Autocasted
-        with torch.autocast(device_type = device_type, dtype = dtype):
-            output = generate(*args, **kwargs)
-        pass
-        return output
+    # Check pad_token
+    model_eos_token_id = getattr(self.config, "eos_token_id", None)
+    if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"):
+        model_eos_token_id = model_eos_token_id[0]
+
+    kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id)
+
+    # Get pixel values for VLMs
+    try: kwargs["pixel_values"] = kwargs["pixel_values"].to(dtype)
+    except: pass
+
+    # Mixed precision autocast
+    with torch.inference_mode(), torch.autocast(device_type = "cuda", dtype = dtype):
+        output = self._old_generate(*args, **kwargs)
     pass
-    return _fast_generate
+
+    FastBaseModel.for_training(self)
+    return output
 pass
 
-class FastBaseVisionModel:
+class FastBaseModel:
 
     @staticmethod
     def from_pretrained(
-        model_name = "unsloth/llama-3-8b-bnb-4bit",
+        model_name = "unsloth/Llama-3.2-1B-Instruct",
         max_seq_length = None,
         dtype = None,
         load_in_4bit = True,
@@ -81,6 +95,7 @@ def from_pretrained(
         trust_remote_code = False,
         model_types = None,
         tokenizer_name = None,
+        auto_model = AutoModelForVision2Seq,
         **kwargs,
     ):
         if trust_remote_code:
@@ -94,12 +109,16 @@ def from_pretrained(
         gpu_stats = torch.cuda.get_device_properties(0)
         max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
 
+        from importlib.metadata import version as importlib_version
+        try: vllm_version = f" vLLM: {importlib_version('vllm')}."
+        except: vllm_version = ""
+
         statistics = \
-        f"==((====))==  Unsloth {__version__}: Fast {model_types[0].title()} vision patching. Transformers: {transformers_version}.\n"\
-        f"   {chr(92)}{chr(92)}   /|    GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
+        f"==((====))==  Unsloth {__version__}: Fast {model_types[0].title()} patching. Transformers: {transformers_version}.{vllm_version}\n"\
+        f"   {chr(92)}{chr(92)}   /|    {gpu_stats.name}. Num GPUs = {torch.cuda.device_count()}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
         f"O^O/ {chr(92)}_/ {chr(92)}    Torch: {torch.__version__}. CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}. Triton: {triton_version}\n"\
         f"{chr(92)}        /    Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
-        f' "-____-"     Free Apache license: http://github.com/unslothai/unsloth'
+        f' "-____-"     Free license: http://github.com/unslothai/unsloth'
         print(statistics)
 
         # Warn about fast transfers
@@ -136,8 +155,8 @@ def from_pretrained(
 
         # Cannot be None, since HF now checks for the config
         if load_in_4bit: kwargs["quantization_config"] = bnb_config
-
-        model = AutoModelForVision2Seq.from_pretrained(
+
+        model = auto_model.from_pretrained(
             model_name,
             device_map = device_map,
             torch_dtype = dtype,
@@ -152,26 +171,25 @@ def from_pretrained(
 
         # Counteract saved tokenizers
         tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
-        tokenizer = AutoProcessor.from_pretrained(
+        auto_processor = AutoProcessor if auto_model is AutoModelForVision2Seq else AutoTokenizer
+        tokenizer = auto_processor.from_pretrained(
             tokenizer_name,
             padding_side = "right",
             token = token,
         )
         # Add padding side as well
-        tokenizer.tokenizer.padding_side = "right"
+        if hasattr(tokenizer, "tokenizer"):
+            tokenizer.tokenizer.padding_side = "right"
 
         model, tokenizer = patch_tokenizer(model, tokenizer)
         model = post_patch_loss_function(model)
-
-        # Fix up config for transformers uploading PEFT
-        # Not necessary anymore since we require transformers>=4.37!
-        if False:
-            name = model.config._name_or_path
-            if name.startswith("unsloth/") and name.endswith("-bnb-4bit"):
-                name = name[:len(name) - len("-bnb-4bit")]
-                model.config.update({"_name_or_path" : name})
-            pass
-        pass
+        # Fix other stuff like BnB compute data types
+        model, tokenizer = patch_model_and_tokenizer(
+            model,
+            tokenizer,
+            downcast_rope = False,
+            fix_embeddings = False,
+        )
 
         # Log Unsloth version for future fastpaths for inference
         if hasattr(model, "config"):
@@ -187,13 +205,22 @@ def from_pretrained(
 
         # Save tokenizer for inference purposes
         tokenizer.padding_side = "left" # Force inference
         tokenizer.tokenizer.padding_side = "left" # Force inference
-        internal_model = model
-        while hasattr(internal_model, "model"):
-            internal_model._saved_temp_tokenizer = tokenizer
-            internal_model = internal_model.model
+        m = model
+        while hasattr(m, "model"):
+            m._saved_temp_tokenizer = tokenizer
+            # Also set is_loaded_in_8bit to disable incorrect DDP
+            m.is_loaded_in_8bit = True
+            m = m.model
         pass
-        internal_model._saved_temp_tokenizer = tokenizer
-
+        m._saved_temp_tokenizer = tokenizer
+        # Also set is_loaded_in_8bit to disable incorrect DDP
+        m.is_loaded_in_8bit = True
+
+        # Patch generate
+        if model.generate.__name__ != "unsloth_base_fast_generate":
+            model._old_generate = model.generate
+            unsloth_base_fast_generate.__doc__ = model._old_generate.__doc__
+            model.generate = types.MethodType(unsloth_base_fast_generate, model)
         return model, tokenizer
     pass
@@ -272,7 +299,7 @@ def get_peft_model(
         # Enable gradients on modules which are trainable
         requires_grad_for_gradient_checkpointing(model)
 
-        model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing)
+        model = FastBaseModel.patch_peft_model(model, use_gradient_checkpointing)
 
         # Clear deleted GPU items
         for _ in range(3):
@@ -281,6 +308,9 @@ def get_peft_model(
             gc.collect()
             torch.cuda.empty_cache()
         pass
         patch_saving_functions(model, vision = True)
+        # Add for_inference and for_training
+        model.for_training = functools.partial(FastBaseModel.for_training, model)
+        model.for_inference = functools.partial(FastBaseModel.for_inference, model)
         return model
     pass
@@ -314,62 +344,57 @@ def patch_peft_model(
         patch_saving_functions(model, vision = True)
 
         # Patch tokenizer to pad to the right
-        internal_model = model
-        while hasattr(internal_model, "model"):
-            if hasattr(internal_model, "_saved_temp_tokenizer"):
-                internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
+        m = model
+        while hasattr(m, "model"):
+            if hasattr(m, "_saved_temp_tokenizer"):
+                m._saved_temp_tokenizer.tokenizer.padding_side = "right"
             pass
-            internal_model = internal_model.model
+            # Also set is_loaded_in_8bit to disable incorrect DDP
+            m.is_loaded_in_8bit = True
+            m = m.model
         pass
-        if hasattr(internal_model, "_saved_temp_tokenizer"):
-            internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
+        if hasattr(m, "_saved_temp_tokenizer"):
+            m._saved_temp_tokenizer.tokenizer.padding_side = "right"
         pass
+        # Also set is_loaded_in_8bit to disable incorrect DDP
+        m.is_loaded_in_8bit = True
 
         # Clear deleted GPU items
         for _ in range(3):
             gc.collect()
             torch.cuda.empty_cache()
         pass
+        # Add for_inference and for_training
+        model.for_training = functools.partial(FastBaseModel.for_training, model)
+        model.for_inference = functools.partial(FastBaseModel.for_inference, model)
+
+        # Patch generate
+        if model.generate.__name__ != "unsloth_base_fast_generate":
+            model._old_generate = model.generate
+            unsloth_base_fast_generate.__doc__ = model._old_generate.__doc__
+            model.generate = types.MethodType(unsloth_base_fast_generate, model)
         return model
     pass
 
     @staticmethod
     def for_inference(model):
-        model.gradient_checkpointing = False
-        model.training = False
-
-        for name, module in model.named_modules():
-            if hasattr(module, "gradient_checkpointing"):
-                module.gradient_checkpointing = False
-            if hasattr(module, "training"):
-                module.training = False
-        pass
-
-        dtype = model.config.torch_dtype
-        if type(dtype) is str:
-            if dtype == "float16": dtype = torch.float16
-            elif dtype == "bfloat16": dtype = torch.bfloat16
-        pass
-        device_type = model.device.type
-
-        # Wrap model.generate
-        if model.generate.__name__ != "_fast_generate":
-            model._unwrapped_old_generate = model.generate
-            model.generate = _wrap_fast_inference(model.generate, device_type, dtype, model)
-        pass
-
-        # Patch tokenizer to pad to the left
-        internal_model = model
-        while hasattr(internal_model, "model"):
-            if hasattr(internal_model, "_saved_temp_tokenizer"):
-                internal_model._saved_temp_tokenizer.tokenizer.padding_side = "left"
-            pass
-            internal_model = internal_model.model
-        pass
-        if hasattr(internal_model, "_saved_temp_tokenizer"):
-            internal_model._saved_temp_tokenizer.tokenizer.padding_side = "left"
+        if not hasattr(model, "parameters"):
+            raise TypeError("Unsloth: I think you're passing a tokenizer, not the model to for_inference!")
+
+        def _for_inference(m):
+            if hasattr(m, "gradient_checkpointing"): m.gradient_checkpointing = False
+            if hasattr(m, "training"): m.training = False
+            # Pad tokenizer to the left
+            if hasattr(m, "_saved_temp_tokenizer"): m._saved_temp_tokenizer.padding_side = "left"
+            # Set a flag for generation!
+            m._flag_for_generation = True
         pass
+        m = model
+        while hasattr(m, "model"):
+            _for_inference(m)
+            m = m.model
+        _for_inference(m)
 
         # Also disable training for embeddings for NEFTune
         if hasattr(model, "get_input_embeddings"):
@@ -380,40 +405,34 @@ def for_training(model, use_gradient_checkpointing = True):
             embeddings = model.get_output_embeddings()
             if hasattr(embeddings, "training"): embeddings.training = False
         pass
-        return model
     pass
 
     @staticmethod
     def for_training(model, use_gradient_checkpointing = True):
-        model.gradient_checkpointing = use_gradient_checkpointing
-        model.training = True
-
-        for name, module in model.named_modules():
-            if hasattr(module, "gradient_checkpointing"):
-                module.gradient_checkpointing = use_gradient_checkpointing
-            if hasattr(module, "training"):
-                module.training = True
-        pass
+        if not hasattr(model, "parameters"):
+            raise TypeError("Unsloth: I think you're passing a tokenizer, not the model to for_training!")
 
-        # Also revert model.generate
-        if hasattr(model, "_unwrapped_old_generate"):
-            model.generate = model._unwrapped_old_generate
-            del model._unwrapped_old_generate
+        # Delete all fast inference loras
+        for param in model.parameters():
+            if hasattr(param, "_fast_lora"):
+                del param._fast_lora
         pass
 
-        # Patch tokenizer to pad to the right
-        internal_model = model
-        while hasattr(internal_model, "model"):
-            if hasattr(internal_model, "_saved_temp_tokenizer"):
-                internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
-            pass
-            internal_model = internal_model.model
-        pass
-        if hasattr(internal_model, "_saved_temp_tokenizer"):
-            internal_model._saved_temp_tokenizer.tokenizer.padding_side = "right"
+        def _for_training(m):
+            if hasattr(m, "gradient_checkpointing"): m.gradient_checkpointing = use_gradient_checkpointing
+            if hasattr(m, "training"): m.training = True
+            # Pad tokenizer to the right
+            if hasattr(m, "_saved_temp_tokenizer"): m._saved_temp_tokenizer.padding_side = "right"
+            # Remove the generation flag
+            if hasattr(m, "_flag_for_generation"): del m._flag_for_generation
         pass
+        m = model
+        while hasattr(m, "model"):
+            _for_training(m)
+            m = m.model
+        _for_training(m)
 
         # Also re-enable training for embeddings for NEFTune
         if hasattr(model, "get_input_embeddings"):
@@ -424,7 +443,6 @@ def for_training(model, use_gradient_checkpointing = True):
             embeddings = model.get_output_embeddings()
             if hasattr(embeddings, "training"): embeddings.training = True
         pass
-        return model
     pass
 pass
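
Below is a minimal usage sketch of the unified loader this patch introduces. It is illustrative only and not part of the diff: it assumes a CUDA GPU and Hugging Face Hub access, uses unsloth/Llama-3.2-1B-Instruct (the new FastBaseModel default above) purely as an example checkpoint, and the generation arguments are arbitrary.

    # Sketch: load a text model through the new FastModel entry point.
    # FastModel.from_pretrained inspects the config: architectures ending in
    # "ForConditionalGeneration" (or configs with a vision_config) are routed
    # through AutoModelForVision2Seq, everything else through AutoModelForCausalLM.
    from unsloth import FastModel

    model, tokenizer = FastModel.from_pretrained(
        "unsloth/Llama-3.2-1B-Instruct",  # example checkpoint, not a recommendation
        max_seq_length = 2048,
        load_in_4bit   = True,
    )

    # generate() is patched to unsloth_base_fast_generate, which calls
    # for_inference() before generating and for_training() afterwards,
    # so no manual mode switching is needed here.
    inputs = tokenizer(["The capital of France is"], return_tensors = "pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens = 16)
    print(tokenizer.batch_decode(output))

FastVisionModel and FastTextModel remain as thin subclasses of FastModel, so existing FastVisionModel call sites keep working unchanged.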