diff --git a/unsloth/__init__.py b/unsloth/__init__.py index 745b210208..980425e1f1 100644 --- a/unsloth/__init__.py +++ b/unsloth/__init__.py @@ -55,7 +55,7 @@ pass # Reduce VRAM usage by reducing fragmentation -os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,roundup_power2_divisions:[64:128,256:64,>:32]" # Hugging Face Hub faster downloads if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ: diff --git a/unsloth/kernels/__init__.py b/unsloth/kernels/__init__.py index 82e7641693..ef5fa5da70 100644 --- a/unsloth/kernels/__init__.py +++ b/unsloth/kernels/__init__.py @@ -42,6 +42,7 @@ apply_lora_mlp_geglu_approx, apply_lora_qkv, apply_lora_o, + fast_lora_forward, ) from .utils import fast_dequantize, fast_gemv, QUANT_STATE, fast_linear_forward, matmul_lora diff --git a/unsloth/kernels/fast_lora.py b/unsloth/kernels/fast_lora.py index 2177b43b9e..c2b7929a29 100644 --- a/unsloth/kernels/fast_lora.py +++ b/unsloth/kernels/fast_lora.py @@ -410,3 +410,81 @@ def apply_lora_o(self, X): O = LoRA_W.apply(X, OW, OW_quant, OA, OB, OS) return O pass + + +IDENTITY_DROPOUT = torch.nn.Identity +@torch._disable_dynamo +def fast_lora_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError( + "Unsloth: Currently not supported yet - reshaping done incorrectly" + ) + self._check_forward_args(x, *args, **kwargs) + adapter_names = kwargs.pop("adapter_names", None) + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif adapter_names is not None: + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + # Fastpath + if len(self.active_adapters) == 1: + active_adapter = self.active_adapters[0] + if active_adapter not in self.lora_A.keys(): return self.base_layer(x, *args, **kwargs) + + dropout = 
self.lora_dropout[active_adapter] + if isinstance(dropout, IDENTITY_DROPOUT) and not self.use_dora[active_adapter]: + lora_A = self.lora_A[active_adapter].weight + lora_B = self.lora_B[active_adapter].weight + scaling = self.scaling[active_adapter] + W = self.base_layer.weight + return LoRA_W.apply(x, W, QUANT_STATE(W), lora_A, lora_B, scaling) + pass + pass + + result = self.base_layer(x, *args, **kwargs) + # As per Tim Dettmers, for 4bit, we need to defensively clone here. + # The reason is that in some cases, an error can occur that backprop + # does not work on a manipulated view. This issue may be solved with + # newer PyTorch versions but this would need extensive testing to be + # sure. + result = result.clone() + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = x.to(lora_A.weight.dtype) + + if not self.use_dora[active_adapter]: + result = result + lora_B(lora_A(dropout(x))) * scaling + else: + if isinstance(dropout, torch.nn.Identity) or not self.training: + base_result = result + else: + x = dropout(x) + base_result = None + + result = result + self.lora_magnitude_vector[active_adapter]( + x, + lora_A=lora_A, + lora_B=lora_B, + scaling=scaling, + base_layer=self.get_base_layer(), + base_result=base_result, + ) + if requires_conversion: + result = result.to(expected_dtype) + + return result +pass diff --git a/unsloth/kernels/rms_layernorm.py b/unsloth/kernels/rms_layernorm.py index 4b22f8c3e5..b74d636c63 100644 --- a/unsloth/kernels/rms_layernorm.py +++ b/unsloth/kernels/rms_layernorm.py @@ -57,6 +57,7 @@ def _rms_layernorm_forward( @triton.jit def _rms_layernorm_backward( dY, dY_row_stride, + dX, dX_row_stride, X, 
X_row_stride, W, W_row_stride, r, r_row_stride, @@ -78,6 +79,9 @@ def _rms_layernorm_backward( X += row_idx * X_row_stride r += row_idx * r_row_stride + if GEMMA: dX += row_idx * dX_row_stride + else: dX = dY + dY_row = tl.load(dY + col_offsets, mask = mask, other = 0).to(tl.float32) X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32) W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32) @@ -91,7 +95,7 @@ rowsum_dY_normed = tl.sum(dY_W * normed, axis = 0) output = inv_var/n_cols * (n_cols*dY_W - normed*rowsum_dY_normed) - tl.store(dY + col_offsets, output, mask = mask) + tl.store(dX + col_offsets, output, mask = mask) pass @@ -172,9 +176,11 @@ def backward(ctx, dY : torch.Tensor): n_cols : int n_rows, n_cols = dY.shape # dW = X + dX = torch.empty_like(dY) if ctx.GEMMA else dY _rms_layernorm_backward[(n_rows,)]( dY, dY.stride(0), + dX, dX.stride(0), X, X .stride(0), W, W .stride(0), r, r .stride(0), @@ -184,7 +190,7 @@ BLOCK_SIZE = ctx.BLOCK_SIZE, num_warps = ctx.num_warps, ) - dX = dY.view(*shape) + dX = dX.view(*shape) return dX, None, None, None pass pass diff --git a/unsloth/models/__init__.py b/unsloth/models/__init__.py index e67a9e5fad..3230cdc207 100644 --- a/unsloth/models/__init__.py +++ b/unsloth/models/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .loader import FastLanguageModel +from .loader import FastLanguageModel, FastVisionModel from .llama import FastLlamaModel from .mistral import FastMistralModel from .qwen2 import FastQwen2Model diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index daa81d97ac..ee85ba3c36 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2024.11.7" +__version__ = "2024.11.8" __all__ = [ "prepare_model_for_kbit_training", @@ -52,6 +52,17 @@ "unpatch_unsloth_gradient_checkpointing", "patch_gradient_checkpointing", "unpatch_gradient_checkpointing", + + "HAS_CUT_CROSS_ENTROPY", + "fused_linear_cross_entropy", + "patch_unsloth_smart_gradient_checkpointing", + "unpatch_unsloth_smart_gradient_checkpointing", + "create_gradient_checkpointing_buffer", + + "patch_compiled_autograd", + "process_vision_info", + "unsloth_compile_transformers", + "patch_fast_lora", ] import torch @@ -70,6 +81,7 @@ patch_layernorm, patch_torch_compile, patch_model_and_tokenizer, + patch_compiled_autograd, ) from unsloth_zoo.gradient_checkpointing import ( Unsloth_Offloaded_Gradient_Checkpointer, @@ -81,6 +93,21 @@ unsloth_gradient_checkpoint, patch_gradient_checkpointing, unpatch_gradient_checkpointing, + + patch_unsloth_smart_gradient_checkpointing, + unpatch_unsloth_smart_gradient_checkpointing, + create_gradient_checkpointing_buffer, +) +from unsloth_zoo.loss_utils import ( + HAS_CUT_CROSS_ENTROPY, + fused_linear_cross_entropy, +) +from unsloth_zoo.vision_utils import ( + process_vision_info, +) +from unsloth_zoo.compiler import ( + get_transformers_model_type, + unsloth_compile_transformers as _unsloth_compile_transformers, ) # ============================================= @@ -120,6 +147,22 @@ def filter(self, x): return not (self.text in x.getMessage()) except: pass +# The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function. 
+try: + from accelerate.utils.modeling import logger as accelerate_utils_modeling_logger + accelerate_utils_modeling_logger.addFilter(HideLoggingMessage("The model weights are not tied")) + del accelerate_utils_modeling_logger +except: + pass + +# Setting `pad_token_id` to `eos_token_id` +try: + from transformers.generation.utils import logger as transformers_generation_utils_logger + transformers_generation_utils_logger.addFilter(HideLoggingMessage("Setting `pad_token_id` to `eos_token_id`")) + del transformers_generation_utils_logger +except: + pass + # ============================================= # ============================================= @@ -282,54 +325,60 @@ def _is_openai_available(): return False # ============================================= # Get Xformers -from xformers import __version__ as xformers_version -# Temporarily disable 0.0.27 and higher - inference issues -if False: #Version(xformers_version) >= Version("0.0.27"): - raise ImportError( - "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ - "then press Disconnect Runtime and then Restart it.\n"\ - "\n"\ - "%%capture\n" - "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" - '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' - '!pip install --no-deps "xformers<=0.0.27" trl peft accelerate bitsandbytes\n'\ - '\n'\ - f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ - 'Please downgrade xformers via `pip install --force-reinstall "xformers<=0.0.27"' - ) -pass +try: + from xformers import __version__ as xformers_version + # Temporarily disable 0.0.27 and higher - inference issues + if False: #Version(xformers_version) >= Version("0.0.27"): + raise ImportError( + "Unsloth: If you are in Colab, we updated the top cell install instructions - please change it to below "\ + "then press Disconnect Runtime and then Restart it.\n"\ + "\n"\ + 
"%%capture\n" + "# Installs Unsloth, Xformers (Flash Attention) and all other packages!\n" + '!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n' + '!pip install --no-deps "xformers<=0.0.27" trl peft accelerate bitsandbytes\n'\ + '\n'\ + f"Otherwise in local machines, your xformers version of {xformers_version} is too new.\n"\ + 'Please downgrade xformers via `pip install --force-reinstall "xformers<=0.0.27"' + ) + pass -if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): - raise ImportError( - f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ - f"Please install xformers < 0.0.24 for torch = {torch_version}." - ) -elif Version(torch_version) < Version("2.3.0") and Version(xformers_version) >= Version("0.0.26"): - raise ImportError( - f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ - f"Please install xformers < 0.0.26 for torch = {torch_version}." - ) -elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) > Version("0.0.27"): - raise ImportError( - f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ - f"Please install xformers <= 0.0.27 for torch = {torch_version}." - ) -pass + if Version(torch_version) < Version("2.2.0") and Version(xformers_version) >= Version("0.0.24"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.24 for torch = {torch_version}." + ) + elif Version(torch_version) < Version("2.3.0") and Version(xformers_version) >= Version("0.0.26"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers < 0.0.26 for torch = {torch_version}." 
+ ) + elif Version(torch_version) < Version("2.4.0") and Version(xformers_version) > Version("0.0.27"): + raise ImportError( + f"Unsloth: You have torch = {torch_version} but xformers = {xformers_version}.\n"\ + f"Please install xformers <= 0.0.27 for torch = {torch_version}." + ) + pass -from xformers._cpp_lib import _register_extensions -try: - _register_extensions() # Check if C++ modules are loaded correctly -except Exception as error: - raise ImportError( - "Unsloth: Xformers was not installed correctly.\n"\ - "Please install xformers separately first.\n"\ - "Then confirm if it's correctly installed by running:\n"\ - "python -m xformers.info\n\n" - "Longer error message:\n" + str(error) - ) + from xformers._cpp_lib import _register_extensions + try: + _register_extensions() # Check if C++ modules are loaded correctly + except Exception as error: + raise ImportError( + "Unsloth: Xformers was not installed correctly.\n"\ + "Please install xformers separately first.\n"\ + "Then confirm if it's correctly installed by running:\n"\ + "python -m xformers.info\n\n" + "Longer error message:\n" + str(error) + ) + pass + import xformers.ops.fmha as xformers + xformers_attention = xformers.memory_efficient_attention +except: + xformers = None + xformers_attention = None + xformers_version = None pass -import xformers.ops.fmha as xformers -xformers_attention = xformers.memory_efficient_attention # Check TRL version from trl import __version__ as trl_version @@ -658,7 +707,7 @@ def get_statistics(): ) def _prepare_backend( - self, cpu: bool = False, sagemaker_dp = False, backend: str = None, + self, cpu = False, sagemaker_dp = False, backend: str = None, ) -> tuple[str, DistributedType]: return None, DistributedType.NO pass @@ -1047,3 +1096,69 @@ def patch_tokenizer(model, tokenizer): model.config.update({"unsloth_version" : __version__}) return model, tokenizer pass + + +def patch_fast_lora(): + import peft.tuners.lora.bnb + peft.tuners.lora.bnb.Linear4bit.forward = 
fast_lora_forward +pass + + +def unsloth_compile_transformers( + model_name, + token = None, + revision = None, + trust_remote_code = False, + sdpa_dynamic_mask = True, + sdpa_bool_masks = True, + sdpa_gqa_replace = True, + sdpa_dynamic_compile = True, + compile_attention = True, + disable_causal_masks = True, + compile_torch_modules = True, + compile_custom_modules = True, + compile_function_calls = True, + fuse_lm_head = True, + gradient_checkpointing = True, + manual_replacements = True, + epilogue_fusion = True, + max_autotune = False, + shape_padding = True, + cudagraphs = False, + debug = False, + import_from_cache = False, + disable = False, +): + if disable: return + model_types = get_transformers_model_type( + model_name = model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + for model_type in model_types: + _unsloth_compile_transformers( + model_type, + sdpa_dynamic_mask = sdpa_dynamic_mask, + sdpa_bool_masks = sdpa_bool_masks, + sdpa_gqa_replace = sdpa_gqa_replace, + sdpa_dynamic_compile = sdpa_dynamic_compile, + compile_attention = compile_attention, + disable_causal_masks = disable_causal_masks, + compile_torch_modules = compile_torch_modules, + compile_custom_modules = compile_custom_modules, + compile_function_calls = compile_function_calls, + fuse_lm_head = fuse_lm_head, + gradient_checkpointing = gradient_checkpointing, + manual_replacements = manual_replacements, + epilogue_fusion = epilogue_fusion, + max_autotune = max_autotune, + shape_padding = shape_padding, + cudagraphs = cudagraphs, + debug = debug, + import_from_cache = import_from_cache, + disable = disable, + ) + pass + return model_types +pass diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py index 4eb9d64313..62ecb9690f 100644 --- a/unsloth/models/gemma2.py +++ b/unsloth/models/gemma2.py @@ -60,8 +60,7 @@ from flash_attn import flash_attn_func # [TODO] We must randomnly use torch.compile? 
-# I checked the gradients and formulas and I'm sure it's correct. -# I'm stumped :( +# Gemma 2 uses double RMS Layernorms, so the backward passes should not overwrite the gradients! @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options) def fast_rms_layernorm_gemma2_compiled(layernorm, X, gemma = True): old_dtype = X.dtype @@ -207,7 +206,7 @@ def Gemma2DecoderLayer_fast_forward( hidden_states += residual else: residual = hidden_states - hidden_states = fast_rms_layernorm_gemma2_compiled(self.input_layernorm, hidden_states, gemma = True) + hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states, gemma = True) hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, causal_mask=causal_mask, @@ -218,14 +217,14 @@ def Gemma2DecoderLayer_fast_forward( use_cache=use_cache, padding_mask=padding_mask, ) - hidden_states = fast_rms_layernorm_gemma2_compiled(self.post_attention_layernorm, hidden_states, gemma = True) + hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states, gemma = True) hidden_states = residual + hidden_states # Fully Connected residual = hidden_states - hidden_states = fast_rms_layernorm_gemma2_compiled(self. pre_feedforward_layernorm, hidden_states, gemma = True) + hidden_states = fast_rms_layernorm(self. 
pre_feedforward_layernorm, hidden_states, gemma = True) hidden_states = self.mlp(hidden_states) - hidden_states = fast_rms_layernorm_gemma2_compiled(self.post_feedforward_layernorm, hidden_states, gemma = True) + hidden_states = fast_rms_layernorm(self.post_feedforward_layernorm, hidden_states, gemma = True) hidden_states = residual + hidden_states pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 47a57024a2..0256fc1830 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -719,25 +719,33 @@ def LlamaModel_fast_forward( pass # Gemma2 has alternating SWA and global attn + use_static_mask = True + dynamic_SWA_mask = None + dynamic_GA_mask = None if IS_GEMMA2: if HAS_FLASH_ATTENTION_SOFTCAPPING and attention_mask is None: self.SWA_mask = True self.GA_mask = False elif attention_mask is not None: - self.SWA_mask = _prepare_4d_causal_attention_mask_for_sdpa( + + # Fixes https://github.com/unslothai/unsloth/issues/853 + # Unsloth needs a 2D mask, not a [2, 1, n, n] mask! + dynamic_SWA_mask = _prepare_4d_causal_attention_mask_for_sdpa( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length, sliding_window = self.config.sliding_window, - ) - self.GA_mask = _prepare_4d_causal_attention_mask_for_sdpa( + )[0][0] + dynamic_GA_mask = _prepare_4d_causal_attention_mask_for_sdpa( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length, sliding_window = None, - ) + )[0][0] + use_static_mask = False + elif not hasattr(self, "SWA_mask"): if HAS_FLEX_ATTENTION: # Use Flex Attention instead! @@ -772,7 +780,12 @@ def LlamaModel_fast_forward( past_key_value = past_key_values[idx] if past_key_values is not None else None mask = causal_mask - if IS_GEMMA2: mask = self.SWA_mask if (idx % 2 == 0) else self.GA_mask + if IS_GEMMA2: + if (idx % 2 == 0): + mask = self.SWA_mask if use_static_mask else dynamic_SWA_mask + else: + mask = self. 
GA_mask if use_static_mask else dynamic_GA_mask + pass if offloaded_gradient_checkpointing: hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply( @@ -955,14 +968,39 @@ def _CausalLM_fast_forward( ) pass hidden_states = outputs[0] + bsz, q_len, hd = hidden_states.shape lm_head = self.lm_head.weight + logit_softcapping = getattr(self.config, "final_logit_softcapping", 0) + logit_scaling = getattr(self.config, "logit_scale", 0) + if bsz == 1 and q_len == 1: logits = torch.mv(lm_head, hidden_states.ravel().to(lm_head.dtype)) logits = logits.unsqueeze(0).unsqueeze(0) elif num_logits_to_keep != 0: logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :].to(lm_head.dtype)) else: + if HAS_CUT_CROSS_ENTROPY and labels is not None: + n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None) + loss = fused_linear_cross_entropy( + hidden_states = hidden_states, + lm_weight = lm_head, + labels = labels, + num_items_in_batch = n_items, + logit_softcapping = logit_softcapping, + ) + if not return_dict: + output = (None,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=None, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + pass logits = self.lm_head(hidden_states.to(lm_head.dtype)) pass @@ -974,8 +1012,6 @@ def _CausalLM_fast_forward( pass loss = None - logit_softcapping = getattr(self.config, "final_logit_softcapping", 0) - logit_scaling = getattr(self.config, "logit_scale", 0) if labels is not None: shift_logits = logits if not hasattr(self, "extra_ignored_labels"): diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index 7a6322d248..232fe6acff 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -20,8 +20,8 @@ from transformers import AutoConfig from transformers import __version__ as transformers_version from peft import PeftConfig, PeftModel -from 
.mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER, MAP_TO_UNSLOTH_16bit -import os +from .loader_utils import get_model_name +import os, contextlib, sys try: from huggingface_hub.utils import get_token except: @@ -63,105 +63,6 @@ def _get_dtype(dtype): pass -def __get_model_name( - model_name, - load_in_4bit = True, - INT_TO_FLOAT_MAPPER = None, - FLOAT_TO_INT_MAPPER = None, - MAP_TO_UNSLOTH_16bit = None, -): - model_name = str(model_name) - lower_model_name = model_name.lower() - - if not SUPPORTS_FOURBIT and lower_model_name in INT_TO_FLOAT_MAPPER: - - model_name = INT_TO_FLOAT_MAPPER[lower_model_name] - logger.warning_once( - f"Unsloth: Your transformers version of {transformers_version} does not support native "\ - f"4bit loading.\nThe minimum required version is 4.37.\n"\ - f'Try `pip install --upgrade "transformers>=4.37"`\n'\ - f"to obtain the latest transformers build, then restart this session.\n"\ - f"For now, we shall load `{model_name}` instead (still 4bit, just slower downloading)." - ) - return model_name - - elif not load_in_4bit and lower_model_name in INT_TO_FLOAT_MAPPER: - - new_model_name = INT_TO_FLOAT_MAPPER[lower_model_name] - # logger.warning_once( - # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ - # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." - # ) - return new_model_name - - elif not load_in_4bit and lower_model_name in MAP_TO_UNSLOTH_16bit: - - new_model_name = MAP_TO_UNSLOTH_16bit[lower_model_name] - return new_model_name - - elif load_in_4bit and SUPPORTS_FOURBIT and lower_model_name in FLOAT_TO_INT_MAPPER: - - new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] - # logger.warning_once( - # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ - # f"We shall load `{new_model_name}` for 4x faster loading." 
- # ) - return new_model_name - pass - - return None -pass - - -def _get_new_mapper(): - try: - import requests - new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" - with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text - new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] - new_mapper = new_mapper\ - .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ - .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER")\ - .replace("MAP_TO_UNSLOTH_16bit", "NEW_MAP_TO_UNSLOTH_16bit") - - exec(new_mapper, globals()) - return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit - except: - return {}, {}, {} - pass -pass - - -def get_model_name(model_name, load_in_4bit = True): - new_model_name = __get_model_name( - model_name = model_name, - load_in_4bit = load_in_4bit, - INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, - FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, - MAP_TO_UNSLOTH_16bit = MAP_TO_UNSLOTH_16bit, - ) - if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): - # Try checking if a new Unsloth version allows it! - NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit = _get_new_mapper() - upgraded_model_name = __get_model_name( - model_name = model_name, - load_in_4bit = load_in_4bit, - INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, - FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, - MAP_TO_UNSLOTH_16bit = NEW_MAP_TO_UNSLOTH_16bit, - ) - if upgraded_model_name is not None: - raise NotImplementedError( - f"Unsloth: {model_name} is not supported in your current Unsloth version! 
Please update Unsloth via:\n\n"\ - 'pip uninstall unsloth -y\n'\ - 'pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"' - ) - pass - pass - return new_model_name if new_model_name is not None else model_name -pass - - class FastLanguageModel(FastLlamaModel): @staticmethod def from_pretrained( @@ -333,7 +234,8 @@ def from_pretrained( else: raise NotImplementedError( f"Unsloth: {model_name} not supported yet!\n"\ - "Make an issue to https://github.com/unslothai/unsloth!", + "Maybe you're doing vision finetuning? Please use FastVisionModel instead!\n"\ + "Otherwise, make an issue to https://github.com/unslothai/unsloth!", ) pass @@ -411,4 +313,236 @@ def from_pretrained( pass return model, tokenizer pass -pass \ No newline at end of file +pass + + +from ._utils import ( + patch_compiling_bitsandbytes, + patch_model_and_tokenizer, + prepare_model_for_kbit_training, + patch_unsloth_smart_gradient_checkpointing, + patch_compiled_autograd, + process_vision_info, + unsloth_compile_transformers, +) +from ..kernels import ( + patch_loss_functions, + post_patch_loss_function, +) +from .vision import FastBaseVisionModel + + +class FastVisionModel(FastBaseVisionModel): + @staticmethod + def from_pretrained( + model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", + max_seq_length = None, # [TODO] No effect + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, # [TODO] No effect + fix_tokenizer = True, # [TODO] No effect + trust_remote_code = False, + use_gradient_checkpointing = "unsloth", + resize_model_vocab = None, # [TODO] No effect + revision = None, + *args, **kwargs, + ): + if token is None: token = get_token() + + patch_compiled_autograd() + patch_compiling_bitsandbytes() + if use_gradient_checkpointing == "unsloth": + patch_unsloth_smart_gradient_checkpointing() + + old_model_name = model_name + model_name = get_model_name(model_name, load_in_4bit) + + 
with open(os.devnull, "w") as _devnull, contextlib.redirect_stdout(_devnull): + patch_loss_functions(torch_compile = False) + model_types = unsloth_compile_transformers( + model_name = model_name, + sdpa_dynamic_mask = True, + sdpa_bool_masks = True, + sdpa_gqa_replace = True, + sdpa_dynamic_compile = True, + compile_attention = True, + disable_causal_masks = True, + compile_torch_modules = True, + compile_custom_modules = True, + compile_function_calls = True, + fuse_lm_head = True, + gradient_checkpointing = True, + manual_replacements = True, + epilogue_fusion = True, + max_autotune = False, + shape_padding = True, + cudagraphs = False, + debug = False, + import_from_cache = False, + disable = False, + ) + pass + + # First check if it's a normal model via AutoConfig + from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled + was_disabled = are_progress_bars_disabled() + disable_progress_bars() + + autoconfig_error = None + peft_error = None + try: + model_config = AutoConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + is_model = True + except Exception as error: + autoconfig_error = str(error) + is_model = False + try: + peft_config = PeftConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + is_peft = True + except Exception as error: + peft_error = str(error) + is_peft = False + pass + + # Both config.json and adapter_config.json should not exist! + + # Old transformers versions check + both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32 + + # New transformers need to check manually. 
+ if SUPPORTS_LLAMA32: + # Check if folder exists locally + if os.path.isdir(model_name): + exist_adapter_config = os.path.exists(os.path.join(model_name, "adapter_config.json")) + exist_config = os.path.exists(os.path.join(model_name, "config.json")) + both_exist = exist_adapter_config and exist_config + else: + files = HfFileSystem(token = token).glob(os.path.join(model_name, "*.json")) + files = (os.path.split(x)[-1] for x in files) + if sum(x == "adapter_config.json" or x == "config.json" for x in files) >= 2: + both_exist = True + pass + pass + pass + + # Error out if both LoRA and normal model config exists. + if both_exist: + raise RuntimeError( + "Unsloth: Your repo has a LoRA adapter and a base model.\n"\ + "You have 2 files `config.json` and `adapter_config.json`.\n"\ + "We must only allow one config file.\n"\ + "Please separate the LoRA and base models to 2 repos." + ) + + elif not is_model and not is_peft: + error = autoconfig_error or peft_error + # Old transformers version + if "rope_scaling" in error.lower() and not SUPPORTS_LLAMA31: + raise ImportError( + f"Unsloth: Your transformers version of {transformers_version} does not support new RoPE scaling methods.\n"\ + f"This includes Llama 3.1. 
The minimum required version is 4.43.2\n"\ + f'Try `pip install --upgrade "transformers>=4.43.2"`\n'\ + f"to obtain the latest transformers build, then restart this session."\ + ) + raise RuntimeError(autoconfig_error or peft_error) + pass + + # Get base model for PEFT: + if is_peft: + # Check base model again for PEFT + model_name = get_model_name(peft_config.base_model_name_or_path, load_in_4bit) + model_config = AutoConfig.from_pretrained( + model_name, + token = token, + revision = revision, + trust_remote_code = trust_remote_code, + ) + pass + + if not was_disabled: enable_progress_bars() + + # Check if this is local model since the tokenizer gets overwritten + if os.path.exists(os.path.join(old_model_name, "tokenizer_config.json")) and \ + os.path.exists(os.path.join(old_model_name, "tokenizer.json")) and \ + os.path.exists(os.path.join(old_model_name, "special_tokens_map.json")): + + tokenizer_name = old_model_name + else: + tokenizer_name = None + pass + + model, tokenizer = FastBaseVisionModel.from_pretrained( + model_name = model_name, + max_seq_length = max_seq_length, + dtype = _get_dtype(dtype), + load_in_4bit = load_in_4bit, + token = token, + device_map = device_map, + trust_remote_code = trust_remote_code, + revision = revision if not is_peft else None, + model_types = model_types, + tokenizer_name = tokenizer_name, + *args, **kwargs, + ) + + if resize_model_vocab is not None: + model.resize_token_embeddings(resize_model_vocab) + pass + + # In case the model supports tagging, add the unsloth tag. + if hasattr(model, "add_model_tags"): + model.add_model_tags(["unsloth",]) + pass + if hasattr(tokenizer, "add_model_tags"): + tokenizer.add_model_tags(["unsloth",]) + pass + + if load_in_4bit: + # Fix up bitsandbytes config + quantization_config = \ + { + # Sometimes torch_dtype is not a string!! 
+ "bnb_4bit_compute_dtype" : model.config.to_dict()["torch_dtype"], + "bnb_4bit_quant_type" : "nf4", + "bnb_4bit_use_double_quant" : True, + "llm_int8_enable_fp32_cpu_offload" : False, + "llm_int8_has_fp16_weight" : False, + "llm_int8_skip_modules" : None, + "llm_int8_threshold" : 6.0, + "load_in_4bit" : True, + "load_in_8bit" : False, + "quant_method" : "bitsandbytes", + } + model.config.update({"quantization_config" : quantization_config}) + pass + + if is_peft: + # From https://github.com/huggingface/peft/issues/184 + # Now add PEFT adapters + model.enable_input_require_grads() + model = PeftModel.from_pretrained( + model, + old_model_name, + token = token, + revision = revision, + is_trainable = True, + trust_remote_code = trust_remote_code, + ) + # Patch it as well! + model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing) + pass + return model, tokenizer + pass +pass diff --git a/unsloth/models/loader_utils.py b/unsloth/models/loader_utils.py new file mode 100644 index 0000000000..b778b7e95b --- /dev/null +++ b/unsloth/models/loader_utils.py @@ -0,0 +1,120 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER, MAP_TO_UNSLOTH_16bit +# https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading! 
+from packaging.version import Version +from transformers import __version__ as transformers_version +transformers_version = Version(transformers_version) +SUPPORTS_FOURBIT = transformers_version >= Version("4.37") + + +def __get_model_name( + model_name, + load_in_4bit = True, + INT_TO_FLOAT_MAPPER = None, + FLOAT_TO_INT_MAPPER = None, + MAP_TO_UNSLOTH_16bit = None, +): + model_name = str(model_name) + lower_model_name = model_name.lower() + + if not SUPPORTS_FOURBIT and lower_model_name in INT_TO_FLOAT_MAPPER: + + model_name = INT_TO_FLOAT_MAPPER[lower_model_name] + print( + f"Unsloth: Your transformers version of {transformers_version} does not support native "\ + f"4bit loading.\nThe minimum required version is 4.37.\n"\ + f'Try `pip install --upgrade "transformers>=4.37"`\n'\ + f"to obtain the latest transformers build, then restart this session.\n"\ + f"For now, we shall load `{model_name}` instead (still 4bit, just slower downloading)." + ) + return model_name + + elif not load_in_4bit and lower_model_name in INT_TO_FLOAT_MAPPER: + + new_model_name = INT_TO_FLOAT_MAPPER[lower_model_name] + # logger.warning_once( + # f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\ + # f"`load_in_4bit = False`. We shall load `{new_model_name}` instead." + # ) + return new_model_name + + elif not load_in_4bit and lower_model_name in MAP_TO_UNSLOTH_16bit: + + new_model_name = MAP_TO_UNSLOTH_16bit[lower_model_name] + return new_model_name + + elif load_in_4bit and SUPPORTS_FOURBIT and lower_model_name in FLOAT_TO_INT_MAPPER: + + new_model_name = FLOAT_TO_INT_MAPPER[lower_model_name] + # logger.warning_once( + # f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\ + # f"We shall load `{new_model_name}` for 4x faster loading." 
+ # ) + return new_model_name + pass + + return None +pass + + +def _get_new_mapper(): + try: + import requests + new_mapper = "https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/models/mapper.py" + with requests.get(new_mapper, timeout = 3) as new_mapper: new_mapper = new_mapper.text + new_mapper = new_mapper[new_mapper.find("__INT_TO_FLOAT_MAPPER"):] + new_mapper = new_mapper\ + .replace("INT_TO_FLOAT_MAPPER", "NEW_INT_TO_FLOAT_MAPPER")\ + .replace("FLOAT_TO_INT_MAPPER", "NEW_FLOAT_TO_INT_MAPPER")\ + .replace("MAP_TO_UNSLOTH_16bit", "NEW_MAP_TO_UNSLOTH_16bit") + + exec(new_mapper, globals()) + return NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit + except: + return {}, {}, {} + pass +pass + + +def get_model_name(model_name, load_in_4bit = True): + new_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = FLOAT_TO_INT_MAPPER, + MAP_TO_UNSLOTH_16bit = MAP_TO_UNSLOTH_16bit, + ) + if new_model_name is None and model_name.count("/") == 1 and model_name[0].isalnum(): + # Try checking if a new Unsloth version allows it! + NEW_INT_TO_FLOAT_MAPPER, NEW_FLOAT_TO_INT_MAPPER, NEW_MAP_TO_UNSLOTH_16bit = _get_new_mapper() + upgraded_model_name = __get_model_name( + model_name = model_name, + load_in_4bit = load_in_4bit, + INT_TO_FLOAT_MAPPER = NEW_INT_TO_FLOAT_MAPPER, + FLOAT_TO_INT_MAPPER = NEW_FLOAT_TO_INT_MAPPER, + MAP_TO_UNSLOTH_16bit = NEW_MAP_TO_UNSLOTH_16bit, + ) + if upgraded_model_name is not None: + raise NotImplementedError( + f"Unsloth: {model_name} is not supported in your current Unsloth version! 
Please update Unsloth via:\n\n"\ + 'pip uninstall unsloth unsloth_zoo -y\n'\ + 'pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"\n'\ + 'pip install --upgrade --no-cache-dir "git+https://github.com/unslothai/unsloth-zoo.git"\n'\ + ) + pass + pass + return new_model_name if new_model_name is not None else model_name +pass diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index d4f1278e1d..fc1dc8cdb0 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -409,12 +409,12 @@ "Qwen/Qwen2.5-Coder-32B", ), "unsloth/Qwen2.5-Coder-0.5B-Instruct-bnb-4bit" : ( - "unsloth/Qwen2.5-Coder-Instruct-0.5B", - "Qwen/Qwen2.5-Coder-Instruct-0.5B", + "unsloth/Qwen2.5-Coder-0.5B-Instruct", + "Qwen/Qwen2.5-Coder-0.5B-Instruct", ), "unsloth/Qwen2.5-Coder-1.5B-Instruct-bnb-4bit" : ( - "unsloth/Qwen2.5-Coder-Instruct-1.5B", - "Qwen/Qwen2.5-Coder-Instruct-1.5B", + "unsloth/Qwen2.5-Coder-1.5B-Instruct", + "Qwen/Qwen2.5-Coder-1.5B-Instruct", ), "unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit" : ( "unsloth/Qwen2.5-Coder-3B-Instruct", @@ -452,6 +452,46 @@ "unsloth/Llama-3.1-Nemotron-70B-Instruct", "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", ), + "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit" : ( + "unsloth/Qwen2-VL-2B-Instruct", + "Qwen/Qwen2-VL-2B-Instruct", + ), + "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit" : ( + "unsloth/Qwen2-VL-7B-Instruct", + "Qwen/Qwen2-VL-7B-Instruct", + ), + "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit" : ( + "unsloth/Llama-3.2-11B-Vision-Instruct", + "meta-llama/Llama-3.2-11B-Vision-Instruct", + ), + "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit" : ( + "unsloth/Llama-3.2-90B-Vision-Instruct", + "meta-llama/Llama-3.2-90B-Vision-Instruct", + ), + "unsloth/Llama-3.2-11B-Vision-bnb-4bit" : ( + "unsloth/Llama-3.2-11B-Vision", + "meta-llama/Llama-3.2-11B-Vision", + ), + "unsloth/Llama-3.2-90B-Vision-bnb-4bit" : ( + "unsloth/Llama-3.2-90B-Vision", + "meta-llama/Llama-3.2-90B-Vision", + ), + 
"unsloth/Pixtral-12B-2409-bnb-4bit" : ( + "unsloth/Pixtral-12B-2409", + "mistralai/Pixtral-12B-2409", + ), + "unsloth/Pixtral-12B-2409-Base-bnb-4bit" : ( + "unsloth/Pixtral-12B-Base-2409", + "mistralai/Pixtral-12B-Base-2409", + ), + "unsloth/llava-1.5-7b-hf-bnb-4bit" : ( + "unsloth/llava-1.5-7b-hf", + "llava-hf/llava-1.5-7b-hf", + ), + "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit" : ( + "unsloth/llava-v1.6-mistral-7b-hf", + "llava-hf/llava-v1.6-mistral-7b-hf", + ), } INT_TO_FLOAT_MAPPER = {} diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index 0b8c08a371..d083144651 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -1,58 +1,86 @@ +# Unsloth Zoo - Utilities for Unsloth # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. # -# http://www.apache.org/licenses/LICENSE-2.0 +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ +import torch +from transformers import ( + BitsAndBytesConfig, + AutoModelForVision2Seq, + AutoProcessor, +) from .llama import * -from ..kernels import patch_layernorm, unpatch_layernorm -from ..kernels import patch_rms_layernorm, unpatch_rms_layernorm -from ..kernels import patch_llama_for_causal_lm, unpatch_llama_for_causal_lm -from ._utils import patch_gradient_checkpointing - -from transformers import AutoProcessor -try: - from transformers import MllamaForConditionalGeneration -except: - raise ImportError( - "Unsloth: Please update your transformers version to 4.46.0 for Llama 3.2 support!" - ) -pass +from ..kernels import ( + post_patch_loss_function, +) +from ._utils import __version__ +from peft import LoraConfig, TaskType, get_peft_model +from transformers import set_seed as transformers_set_seed +from unsloth_zoo.peft_utils import ( + get_peft_regex, + merge_and_overwrite_lora, +) + +__all__ = [ + "FastBaseVisionModel", +] + +def _wrap_fast_inference(generate, device_type, dtype, model): + # Wraps inference with bfloat16 / float16 + @torch.inference_mode + def _fast_generate(*args, **kwargs): + # For num_logits_to_keep + kwargs["num_logits_to_keep"] = 1 + + # Remove token_type_ids + kwargs.pop("token_type_ids", None) + + # Check pad_token + model_eos_token_id = getattr(model.config, "eos_token_id", None) + if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"): + model_eos_token_id = model_eos_token_id[0] + + kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id) -class FastVisionModel: + try: + kwargs["pixel_values"] = kwargs["pixel_values"].to(model.dtype) + except: + pass - def pre_patch(self): - patch_gradient_checkpointing() - patch_layernorm() - patch_rms_layernorm() - patch_llama_for_causal_lm() + # Autocasted + with torch.autocast(device_type = device_type, dtype = dtype): + output = generate(*args, **kwargs) + pass + return output pass + return _fast_generate +pass - def post_unpatch(self): - 
unpatch_layernorm() - unpatch_rms_layernorm() - unpatch_llama_for_causal_lm() - pass +class FastBaseVisionModel: @staticmethod def from_pretrained( - model_name = "llava-hf/llava-1.5-7b-hf", + model_name = "unsloth/llama-3-8b-bnb-4bit", max_seq_length = None, dtype = None, load_in_4bit = True, token = None, device_map = "sequential", - rope_scaling = None, trust_remote_code = False, + model_types = None, + tokenizer_name = None, **kwargs, ): if trust_remote_code: @@ -67,7 +95,7 @@ def from_pretrained( max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) statistics = \ - f"==((====))== Unsloth {__version__}: Fast {model_patcher.__name__[4:-5]} patching. Transformers = {transformers_version}.\n"\ + f"==((====))== Unsloth {__version__}: Fast {model_types[0].title()} vision patching. Transformers = {transformers_version}.\n"\ f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\ f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\ f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. 
FA2 = {HAS_FLASH_ATTENTION}]\n"\ @@ -81,6 +109,7 @@ def from_pretrained( pass # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer + os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" get_statistics() # For debugging - we use a download counter to see if environments are not breaking @@ -105,160 +134,36 @@ def from_pretrained( ) pass + kwargs.pop("attn_implementation", None); # No need since we auto call it + # Cannot be None, since HF now checks for the config if load_in_4bit: kwargs["quantization_config"] = bnb_config - self.pre_patch() - model = MllamaForConditionalGeneration.from_pretrained( + model = AutoModelForVision2Seq.from_pretrained( model_name, device_map = device_map, torch_dtype = dtype, - # quantization_config = bnb_config, + # quantization_config = bnb_config, token = token, - max_position_embeddings = max_position_embeddings, trust_remote_code = trust_remote_code, - attn_implementation = "sdpa", + # attn_implementation = "sdpa", [TODO] Pixtral for eg fails **kwargs, ) - self.post_unpatch() - # Return old flag os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer # We currently only support NVIDIA GPUs - AMD / Intel is a work in progress! 
post_check = check_nvidia() # Counteract saved tokenizers + tokenizer_name = model_name if tokenizer_name is None else tokenizer_name tokenizer = AutoProcessor.from_pretrained( - model_name, - ) - model = FastVisionModel.post_patch(model) - - # Patch Trainer - from transformers.trainer import Trainer - try: - if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": - inner_training_loop = inspect.getsource(Trainer._inner_training_loop) - Trainer._original_training_loop = inner_training_loop - else: - inner_training_loop = Trainer._original_training_loop - except: - raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!') - pass - - if ((post_check - pre_check) >= 1).sum() > 1: - raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!') - - import transformers.trainer - items_in_trainer = dir(transformers.trainer) - good_items = [] - for item in items_in_trainer: - # TODO: Support Deepspeed - if item.startswith(("deepspeed", "xm", "met", "smp")): continue - if item in inner_training_loop: good_items.append(item) - pass - exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals()) - - start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0] - end = inner_training_loop.find("\n\n", start) - original_debug = inner_training_loop[start:end] - spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:] - front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0) - - debug_info = """debug_info = \\ - f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\ - f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\ - f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\ - f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = 
{max_steps:,}\\n"\\ - f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}' - logger.warning(debug_info) - import subprocess, re, gc, numpy as np - a = np.array([0,]) - try: - a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True) - a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a) - a = np.array([int(x.decode('utf-8'))/1024 for x in a]) - except: - if not torch.cuda.is_available(): - raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!') - if ((a - PRE_CHECK) >= 1).sum() > 1: - raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!') - for _ in range(3): - gc.collect() - torch.cuda.empty_cache()""" - - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace(original_debug, debug_info) - - debug_info = """n_total_devices = total_train_batch_size // \\ - args.gradient_accumulation_steps // self._train_batch_size - if n_total_devices > 1: - logger.warning_once('Unsloth currently does not support multi GPU setups - but we are working on it!') - debug_info =""" - debug_info = debug_info.split('\n') - debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]]) - inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1) - - front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0) - inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = tpu_spmd_dataloader(train_dataloader)", - "raise RuntimeError('Unsloth: TPUs are not yet supported!')" + tokenizer_name, + padding_side = "right", + token = token, ) - inner_training_loop = inner_training_loop.replace( - "self.accelerator.free_memory()", - 
"self.accelerator.free_memory()\n" + \ - front_spaces + "if self.is_deepspeed_enabled:"\ - "raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1, - ) - - check_batches = """train_dataloader = self.get_train_dataloader() - ga = args.gradient_accumulation_steps - bsz = self._train_batch_size - total_batches = bsz * ga * args.world_size - n_total_devices = total_batches // ga // bsz - if n_total_devices > 1: - logger.warning_once('Unsloth currently does not support multi GPU setups - but we are working on it!') - divisor = n_total_devices / 1 - bsz = self._train_batch_size = max(int(bsz / divisor), 1) - if total_batches // ga // bsz > 1: - divisor = n_total_devices / 1 - ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)""" - check_batches = check_batches.split('\n') - check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]]) - inner_training_loop = inner_training_loop.replace( - "train_dataloader = self.get_train_dataloader()", - check_batches, 1, - ) - inner_training_loop = inner_training_loop.replace( - "_inner_training_loop", - "_fast_inner_training_loop", 1, - ) - exec(inner_training_loop, globals()) - Trainer._inner_training_loop = _fast_inner_training_loop - inner_training_loop = inner_training_loop.replace( - "is_torch_tpu_available()", - "False", - ) - if "n_total_devices >" not in inner_training_loop: - raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!') - pass - inner_training_loop = inner_training_loop.replace( - "is_sagemaker_mp_enabled()", - "False", - ) - exec(inner_training_loop, globals()) - Trainer._inner_training_loop = _fast_inner_training_loop - - # Save max_seq_length - model.max_seq_length = max_position_embeddings - internal_model = model - while hasattr(internal_model, "model"): - internal_model.max_seq_length = max_position_embeddings - internal_model = internal_model.model - pass - internal_model.max_seq_length = 
max_position_embeddings + model, tokenizer = patch_tokenizer(model, tokenizer) + model = post_patch_loss_function(model) # Fix up config for transformers uploading PEFT # Not necessary anymore since we require transformers>=4.37! @@ -271,121 +176,105 @@ def from_pretrained( pass # Log Unsloth version for future fastpaths for inference - model.config.update({"unsloth_version" : __version__}) - - # Add save modules - patch_saving_functions(model) - Trainer._inner_training_loop = _fast_inner_training_loop + if hasattr(model, "config"): + model.config.update({"unsloth_version" : __version__}) + pass + patch_saving_functions(model, vision = True) + patch_saving_functions(tokenizer, vision = True) - # Also fix torch_dtype + # Save tokenizer for inference purposes + tokenizer.padding_side = "left" # Force inference internal_model = model while hasattr(internal_model, "model"): - if hasattr(internal_model, "config"): - if internal_model.config.torch_dtype == "float32": - internal_model.config.torch_dtype = torch.float32 - elif internal_model.config.torch_dtype == "bfloat16": - internal_model.config.torch_dtype = torch.bfloat16 - elif internal_model.config.torch_dtype == "float16": - internal_model.config.torch_dtype = torch.float16 - pass - pass + internal_model._saved_temp_tokenizer = tokenizer internal_model = internal_model.model pass - if hasattr(internal_model, "config"): - if internal_model.config.torch_dtype == "float32": - internal_model.config.torch_dtype = torch.float32 - elif internal_model.config.torch_dtype == "bfloat16": - internal_model.config.torch_dtype = torch.bfloat16 - elif internal_model.config.torch_dtype == "float16": - internal_model.config.torch_dtype = torch.float16 - pass - pass + internal_model._saved_temp_tokenizer = tokenizer return model, tokenizer pass - @staticmethod - def post_patch(model): - # Patch model - layers = model.model.layers - lm_head = model.get_output_embeddings().weight - - # Also patch all dtypes - BnB seems to not allocate 
the correct type? - # BnB default dtype seems to be float16! - correct_dtype = lm_head.weight.dtype - - for name, module in model.named_modules(): - if isinstance(module, (Bnb_Linear4bit, Peft_Linear4bit)): - weight = module.weight - quant_state = weight.quant_state - - if type(quant_state) is list: - # BnB seems to have float16 as default! - module.weight.quant_state[2] = correct_dtype # Cast to correct dtype - else: - # https://github.com/TimDettmers/bitsandbytes/pull/763/files - quant_state.dtype = correct_dtype - pass - pass - pass - - # Clear deleted GPU items - for _ in range(3): - gc.collect() - torch.cuda.empty_cache() - return model - pass - - @staticmethod def get_peft_model( model, - r = 16, - target_modules = "all-linear", - lora_alpha = 16, - lora_dropout = 0, - bias = "none", - layers_to_transform = None, - layers_pattern = None, + r = 16, + target_modules = None, + lora_alpha = 16, + lora_dropout = 0, + bias = "none", + finetune_vision_layers = True, + finetune_language_layers = True, + finetune_attention_modules = True, + finetune_mlp_modules = True, + layers_to_transform = None, + layers_pattern = None, use_gradient_checkpointing = True, - random_state = 3407, - max_seq_length = 2048, # not used anymore - use_rslora = False, - modules_to_save = None, - init_lora_weights = True, - loftq_config = {}, - temporary_location = "_unsloth_temporary_saved_buffers", + random_state = 3407, + max_seq_length = 2048, # not used anymore + use_rslora = False, + modules_to_save = None, + init_lora_weights = True, + loftq_config = {}, + temporary_location = "_unsloth_temporary_saved_buffers", **kwargs, ): transformers_set_seed(random_state) - # Get LoRA - arguments = dict( - r = r, - lora_alpha = lora_alpha, - target_modules = target_modules, - lora_dropout = lora_dropout, - bias = bias, - layers_to_transform = layers_to_transform, - init_lora_weights = init_lora_weights, - # loftq_config = loftq_config, - # use_rslora = use_rslora, - modules_to_save = 
modules_to_save, - **kwargs, - ) + if type(r) is not int: + raise TypeError(f"Unsloth: Rank of {str(r)} must be an integer.") + if r <= 0: + raise TypeError(f"Unsloth: Rank of {str(r)} must be larger than 0.") + + if isinstance(model, PeftModelForCausalLM): + raise RuntimeError("Unsloth: You already added LoRA adapters to your model!") + + if target_modules == "all-linear": + finetune_vision_layers = True + finetune_language_layers = True + finetune_attention_modules = True + finetune_mlp_modules = True + pass + if target_modules is None: + target_modules = get_peft_regex( + model, + finetune_vision_layers = finetune_vision_layers, + finetune_language_layers = finetune_language_layers, + finetune_attention_modules = finetune_attention_modules, + finetune_mlp_modules = finetune_mlp_modules, + ) + else: + assert(type(target_modules) in (list, tuple,)) + pass - lora_config = LoraConfig(**arguments) + # Clear deleted GPU items + for _ in range(3): + gc.collect() + torch.cuda.empty_cache() + pass - model = _get_peft_model(model, lora_config) + lora_config = LoraConfig( + r = r, + lora_alpha = lora_alpha, + target_modules = target_modules, + lora_dropout = lora_dropout, + bias = bias, + task_type = TaskType.CAUSAL_LM, + ) + model = prepare_model_for_kbit_training( + model, + use_gradient_checkpointing = use_gradient_checkpointing, + ) + model = get_peft_model(model, lora_config) - model = FastVisionModel.patch_peft_model(model, use_gradient_checkpointing) + model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing) # Clear deleted GPU items for _ in range(3): gc.collect() torch.cuda.empty_cache() pass + patch_saving_functions(model, vision = True) return model pass @@ -396,6 +285,11 @@ def patch_peft_model( model, use_gradient_checkpointing = True, ): + if not isinstance(model, PeftModelForCausalLM): + raise TypeError( + "Unsloth: Your model needs to call `.get_peft_model` first!" 
+ ) + pass model = prepare_model_for_kbit_training( model, @@ -403,20 +297,6 @@ def patch_peft_model( use_reentrant = True, ) - # Fix up config for transformers uploading PEFT - for active_adapter in model.peft_config.keys(): - # Not necessary since we requires transformers >= 4.37 - if False: - name = model.peft_config[active_adapter].base_model_name_or_path - if name.startswith("unsloth/") and name.endswith("-bnb-4bit"): - name = name[:len(name) - len("-bnb-4bit")] - model.peft_config[active_adapter].base_model_name_or_path = name - pass - # Add revision to enable future fast inference paths - # [TODO] Bugs out!see https://github.com/unslothai/unsloth/issues/492 - # model.peft_config[active_adapter].revision = f"unsloth" - pass - from transformers.trainer import Trainer if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop": raise RuntimeError( @@ -426,24 +306,7 @@ def patch_peft_model( 'Thank you for your understanding and we appreciate it immensely!' ) pass - - logger.warning_once( - f"Unsloth {__version__} patched {len(model.model.model.layers)} layers with "\ - f"{n_qkv} QKV layers, {n_o} O layers and {n_mlp} MLP layers.", - ) - patch_saving_functions(model) - - # Patch cross entropy loss labels - # Fixes https://github.com/unslothai/unsloth/issues/10 - max_seq_length = model.max_seq_length - extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda:0") - model.model.extra_ignored_labels = extra_ignored_labels - internal_model = model - while hasattr(internal_model, "model"): - internal_model.max_seq_length = max_seq_length - internal_model = internal_model.model - pass - internal_model.max_seq_length = max_seq_length + patch_saving_functions(model, vision = True) # Patch tokenizer to pad to the right internal_model = model @@ -468,37 +331,40 @@ def patch_peft_model( @staticmethod def for_inference(model): - # if model.config.model_type == "qwen2": - # FastLlamaModel.for_training(model) - # return - # pass - - internal_model 
= model - internal_model.gradient_checkpointing = False - internal_model.training = False + model.gradient_checkpointing = False + model.training = False - while hasattr(internal_model, "model"): - internal_model = internal_model.model - internal_model.gradient_checkpointing = False - internal_model.training = False - pass - if hasattr(internal_model, "training"): - internal_model.training = False + for name, module in model.named_modules(): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = False + if hasattr(module, "training"): + module.training = False pass - # Also check if lm_head / embeddings are trained - internal_model = model - while not hasattr(internal_model, "lm_head"): - internal_model = internal_model.model - pass - lm_head = internal_model.lm_head.weight - device_type = lm_head.device.type dtype = model.config.torch_dtype - if type(dtype) is str: if dtype == "float16": dtype = torch.float16 elif dtype == "bfloat16": dtype = torch.bfloat16 pass + device_type = model.device.type + + # Wrap model.generate + if model.generate.__name__ != "_fast_generate": + model._unwrapped_old_generate = model.generate + model.generate = _wrap_fast_inference(model.generate, device_type, dtype, model) + pass + + # Patch tokenizer to pad to the left + internal_model = model + while hasattr(internal_model, "model"): + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "left" + pass + internal_model = internal_model.model + pass + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "left" + pass # Also disable training for embeddings for NEFTune if hasattr(model, "get_input_embeddings"): @@ -516,23 +382,32 @@ def for_inference(model): @staticmethod def for_training(model, use_gradient_checkpointing = True): - internal_model = model - internal_model.gradient_checkpointing = use_gradient_checkpointing - internal_model.training = True + 
model.gradient_checkpointing = use_gradient_checkpointing + model.training = True - # Delete all fast inference loras - for param in model.parameters(): - if hasattr(param, "_fast_lora"): - del param._fast_lora + for name, module in model.named_modules(): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = use_gradient_checkpointing + if hasattr(module, "training"): + module.training = True pass + # Also revert model.generate + if hasattr(model, "_unwrapped_old_generate"): + model.generate = model._unwrapped_old_generate + del model._unwrapped_old_generate + pass + + # Patch tokenizer to pad to the right + internal_model = model while hasattr(internal_model, "model"): + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" + pass internal_model = internal_model.model - internal_model.gradient_checkpointing = use_gradient_checkpointing - internal_model.training = True pass - if hasattr(internal_model, "training"): - internal_model.training = True + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" pass # Also re-enable training for embeddings for NEFTune @@ -548,3 +423,5 @@ def for_training(model, use_gradient_checkpointing = True): return model pass pass + + diff --git a/unsloth/save.py b/unsloth/save.py index b4c6b499cf..b503b2b47a 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -2041,8 +2041,153 @@ def unsloth_convert_lora_to_ggml_and_save_locally( print("Unsloth: Done.") print(f"Unsloth: Conversion completed! Output file: {output_file}") print("\nThis GGML making function was made by Maheswar. 
Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!") +pass + + +from unsloth_zoo.peft_utils import merge_and_overwrite_lora +from .models.loader_utils import get_model_name + +@torch.inference_mode +def unsloth_generic_save( + model, + tokenizer, + save_directory : Union[str, os.PathLike] = "unsloth_finetuned_merge", + save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"] + push_to_hub : bool = False, + token : Optional[Union[str, bool]] = None, + is_main_process : bool = True, + state_dict : Optional[dict] = None, + save_function : Callable = torch.save, + max_shard_size : Union[int, str] = "5GB", + safe_serialization : bool = True, + variant : Optional[str] = None, + save_peft_format : bool = True, + + # Push to hub + use_temp_dir : Optional[bool] = None, + commit_message : Optional[str] = "Trained with Unsloth", + private : Optional[bool] = None, + create_pr : bool = False, + revision : str = None, + commit_description : str = "Upload model trained with Unsloth 2x faster", + tags : List[str] = None, + + # Our functions + temporary_location : str = "_unsloth_temporary_saved_buffers", + maximum_memory_usage : float = 0.9, +): + if token is None and push_to_hub: token = get_token() + + merge_and_overwrite_lora( + get_model_name, + create_huggingface_repo, + model, + save_location = save_directory, + push_to_hub = push_to_hub, + token = token, + upload_location = save_directory if push_to_hub else None, + low_disk_space_usage = True, + private = private, + ) + return +pass + + +def unsloth_generic_save_pretrained_merged( + self, + save_directory : Union[str, os.PathLike], + tokenizer = None, + save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"] + push_to_hub : bool = False, + token : Optional[Union[str, bool]] = None, + is_main_process : bool = True, + state_dict : Optional[dict] = None, + save_function : Callable = torch.save, + max_shard_size : Union[int, str] = "5GB", + 
safe_serialization : bool = True, + variant : Optional[str] = None, + save_peft_format : bool = True, + tags : List[str] = None, + temporary_location : str = "_unsloth_temporary_saved_buffers", + maximum_memory_usage : float = 0.75, +): + """ + Same as .push_to_hub(...) except 4bit weights are auto + converted to float16 with as few overhead as possible. -def patch_saving_functions(model): + Choose for `save_method` to be either: + 1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp. + 2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference. + 3. `lora`: Save LoRA adapters with no merging. Useful for HF inference. + """ + if tokenizer is None: + logger.warning_once( + "Unsloth: You're not saving a tokenizer as well?\n"\ + "You can do it separately via `tokenizer.save_pretrained(...)`" + ) + pass + + arguments = dict(locals()) + arguments["model"] = self + del arguments["self"] + unsloth_generic_save(**arguments) + for _ in range(3): + gc.collect() +pass + + +def unsloth_generic_push_to_hub_merged( + self, + repo_id : str, + tokenizer = None, + save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"] + use_temp_dir : Optional[bool] = None, + commit_message : Optional[str] = "Trained with Unsloth", + private : Optional[bool] = None, + token : Union[bool, str, None] = None, + max_shard_size : Union[int, str, None] = "5GB", + create_pr : bool = False, + safe_serialization : bool = True, + revision : str = None, + commit_description : str = "Upload model trained with Unsloth 2x faster", + tags : Optional[List[str]] = None, + temporary_location : str = "_unsloth_temporary_saved_buffers", + maximum_memory_usage : float = 0.75, +): + """ + Same as .push_to_hub(...) except 4bit weights are auto + converted to float16 with as few overhead as possible. + + Choose for `save_method` to be either: + 1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp. + 2. `4bit`: Merge LoRA into int4 weights. 
Useful for DPO / HF inference. + 3. `lora`: Save LoRA adapters with no merging. Useful for HF inference. + """ + if tokenizer is None: + logger.warning_once( + "Unsloth: You're not saving a tokenizer as well?\n"\ + "You can do it separately via `tokenizer.push_to_hub(...)`" + ) + pass + + arguments = dict(locals()) + arguments["model"] = self + arguments["save_directory"] = repo_id + arguments["push_to_hub"] = True + del arguments["self"] + del arguments["repo_id"] + unsloth_generic_save(**arguments) + for _ in range(3): + gc.collect() +pass + + +def not_implemented_save(*args, **kwargs): + raise NotImplementedError("Unsloth: Sorry GGUF is currently not supported for vision models!") +pass + + +def patch_saving_functions(model, vision = False): import inspect import types from typing import Callable, Optional, Union, List @@ -2131,14 +2276,22 @@ def patch_saving_functions(model): pass # Add saving methods to top level model - if hasattr(model, "config"): - # Counteract tokenizers - model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model) - model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model) - model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) - model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) - model.push_to_hub_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model) - model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model) + if not vision: + if hasattr(model, "config"): + # Counteract tokenizers + model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model) + model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model) + model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model) + model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model) + model.push_to_hub_ggml = 
types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model) + model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model) + pass + else: + # Vision only 1 option + model.push_to_hub_merged = types.MethodType(unsloth_generic_push_to_hub_merged, model) + model.save_pretrained_merged = types.MethodType(unsloth_generic_save_pretrained_merged, model) + model.push_to_hub_gguf = types.MethodType(not_implemented_save, model) + model.save_pretrained_gguf = types.MethodType(not_implemented_save, model) pass return model pass diff --git a/unsloth/trainer.py b/unsloth/trainer.py index 00956ed41b..012be4b0cb 100644 --- a/unsloth/trainer.py +++ b/unsloth/trainer.py @@ -20,13 +20,13 @@ import trl import inspect from trl import SFTTrainer -try: - from trl import SFTConfig as TrainingArguments -except: - from transformers import TrainingArguments -pass from . import is_bfloat16_supported -from unsloth_zoo.training_utils import unsloth_train as _unsloth_train +from unsloth_zoo.training_utils import ( + unsloth_train as _unsloth_train, +) +from unsloth_zoo.vision_utils import ( + UnslothVisionDataCollator, +) from packaging.version import Version import dataclasses @@ -35,6 +35,7 @@ "UnslothTrainer", "unsloth_train", "_patch_trl_trainer", + "UnslothVisionDataCollator", ] # Unsloth gradient accumulation fix: @@ -60,7 +61,11 @@ def unsloth_train(trainer, *args, **kwargs): pass pass - +try: + from trl import SFTConfig as TrainingArguments +except: + from transformers import TrainingArguments +pass @dataclass class UnslothTrainingArguments(TrainingArguments): embedding_learning_rate : Optional[float] = field( @@ -134,7 +139,7 @@ def create_optimizer(self): # From `trl>=0.13.0`, they changed how to pass several params to the trainer # We need to patch to make the transition smooth -def create_backwards_compatible_trainer(trainer_class, config_class): +def _backwards_compatible_trainer(trainer_class, config_class): original_init = 
trainer_class.__init__ @wraps(original_init) @@ -167,6 +172,7 @@ def new_init(self, *args, **kwargs): } # Get parameters that exist in Config but not in TrainingArguments + from transformers import TrainingArguments moved_params = \ set(inspect.signature(config_class) .parameters.keys()) - \ set(inspect.signature(TrainingArguments).parameters.keys()) @@ -207,14 +213,13 @@ def _patch_trl_trainer(): import trl.trainer trl_classes = dir(trl.trainer) - - non_convertable_trainer = set(["PPOv2", "AlignProp"]) - trl_trainers = set(x[:-len("Trainer")] for x in trl_classes if x.endswith("Trainer")) - non_convertable_trainer - trl_configs = set(x[:-len("Config")] for x in trl_classes if x.endswith("Config")) - non_convertable_trainer + trl_trainers = set(x[:-len("Trainer")] for x in trl_classes if x.endswith("Trainer")) + trl_configs = set(x[:-len("Config")] for x in trl_classes if x.endswith("Config")) trl_classes = list(trl_trainers & trl_configs) for x in trl_classes: - exec(f"trl.{x}Trainer.__init__ = create_backwards_compatible_trainer(trl.{x}Trainer, trl.{x}Config)", globals()) + try: exec(f"trl.{x}Trainer.__init__ = _backwards_compatible_trainer(trl.{x}Trainer, trl.{x}Config)", globals()) + except: continue pass trl.__UNSLOTH_BACKWARDS_COMPATIBLE__ = True