Patch summary (free text before the first file header; patch tools skip it):
  * _gpu_init.py: define a fallback is_bf16_supported() when no backend branch defined one.
  * import_fixes.py: return early from fix_vllm_aimv2_issue() if the vllm version lookup raises.
  * kernels/utils.py: make the bitsandbytes import optional; bnb-derived symbols become None on failure.
  * models/_utils.py: patch_fast_lora() prints a notice and returns if peft.tuners.lora.bnb cannot be imported.
  * models/granite.py, save.py: guard the Linear4bit imports and isinstance() against missing types.
  * models/llama.py: drop "ensure_weight_tying" when LoraConfig.__init__ does not accept it.
  * models/rl_replacements.py: masked_batch_mean() trims x and completion_mask to a common trailing length.
NOTE(review): line breaks and indentation were reconstructed from a whitespace-collapsed copy.
Hunk line counts were re-checked, but context-line indentation and blank-line placement are
inferred - confirm against the target tree before applying.

diff --git a/unsloth/_gpu_init.py b/unsloth/_gpu_init.py
index 2309ab3366..b35e18cf75 100644
--- a/unsloth/_gpu_init.py
+++ b/unsloth/_gpu_init.py
@@ -234,6 +234,14 @@ def is_bf16_supported():
     # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported()
     SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported()
 
+# Backwards compatibility: some notebooks import `unsloth.is_bf16_supported`.
+# Ensure it exists on all backends (HIP / XPU) and has a stable signature.
+if "is_bf16_supported" not in globals():
+
+    def is_bf16_supported(including_emulation = False):
+        return SUPPORTS_BFLOAT16
+
+
 # For Gradio HF Spaces?
 # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
 import triton
diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py
index 182355640a..5173fb5ce0 100644
--- a/unsloth/import_fixes.py
+++ b/unsloth/import_fixes.py
@@ -405,7 +405,10 @@ def fix_vllm_aimv2_issue():
     spec = importlib.util.find_spec("vllm")
     if spec is None:
         return
-    vllm_version = importlib_version("vllm")
+    try:
+        vllm_version = importlib_version("vllm")
+    except Exception:
+        return
     if Version(vllm_version) < Version("0.10.1"):
         vllm_location = spec.origin
         if vllm_location is None:
diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py
index f77baea281..08d67d8c36 100644
--- a/unsloth/kernels/utils.py
+++ b/unsloth/kernels/utils.py
@@ -136,11 +136,15 @@ def calculate_settings(
 
 
 HAS_CUDA_STREAM = False
-import bitsandbytes as bnb
+try:
+    import bitsandbytes as bnb
 
-# https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
-HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
-get_ptr = bnb.functional.get_ptr
+    # https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
+    HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
+    get_ptr = bnb.functional.get_ptr
+except Exception:
+    bnb = None
+    get_ptr = None
 
 if DEVICE_TYPE == "xpu":
     HAS_XPU_STREAM = True
@@ -236,21 +240,32 @@ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
 WEIGHT_BUFFERS = []
 ABSMAX_BUFFERS = []
 
-# Bitsandbytes operations
+# Bitsandbytes operations (optional)
 ctypes_c_int = ctypes.c_int
 ctypes_c_int32 = ctypes.c_int32
-cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
-cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
-cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
 
-if DEVICE_TYPE == "xpu":
-    # https://github.com/bitsandbytes-foundation/bitsandbytes/blob/c3b8de268fdb55a88f92feada23fc811a1e6877a/bitsandbytes/backends/xpu/ops.py#L115
-    # for xpu, inference gemv using above link
-    cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemv_4bit_inference_fp16
-    cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemv_4bit_inference_bf16
+if bnb is not None:
+    cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
+    cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
+    cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
+
+    if DEVICE_TYPE == "xpu":
+        # https://github.com/bitsandbytes-foundation/bitsandbytes/blob/c3b8de268fdb55a88f92feada23fc811a1e6877a/bitsandbytes/backends/xpu/ops.py#L115
+        cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemv_4bit_inference_fp16
+        cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemv_4bit_inference_bf16
+    else:
+        cgemm_4bit_inference_naive_fp16 = (
+            bnb.functional.lib.cgemm_4bit_inference_naive_fp16
+        )
+        cgemm_4bit_inference_naive_bf16 = (
+            bnb.functional.lib.cgemm_4bit_inference_naive_bf16
+        )
 else:
-    cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
-    cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
+    cdequantize_blockwise_fp32 = None
+    cdequantize_blockwise_fp16_nf4 = None
+    cdequantize_blockwise_bf16_nf4 = None
+    cgemm_4bit_inference_naive_fp16 = None
+    cgemm_4bit_inference_naive_bf16 = None
 
 
 torch_device_stream = (
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 67b3468425..289c5e6a7e 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -2540,7 +2540,14 @@ def patch_tokenizer(model, tokenizer):
 
 
 def patch_fast_lora():
-    import peft.tuners.lora.bnb
+    try:
+        import peft.tuners.lora.bnb
+    except Exception as e:
+        print(
+            "Unsloth: bitsandbytes/peft bnb not available - skipping 4bit LoRA patch.",
+            repr(e),
+        )
+        return
     from ..kernels.fast_lora import fast_lora_forward
 
     peft.tuners.lora.bnb.Linear4bit.forward = fast_lora_forward
diff --git a/unsloth/models/granite.py b/unsloth/models/granite.py
index fea3dc1b36..4dc3fc5b53 100644
--- a/unsloth/models/granite.py
+++ b/unsloth/models/granite.py
@@ -30,8 +30,18 @@
     LlamaLinearScalingRotaryEmbedding,
 )
 from .mistral import *
-from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
-from peft.tuners.lora import Linear4bit as Peft_Linear4bit
+
+try:
+    from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
+except Exception:
+    Bnb_Linear4bit = None
+
+try:
+    from peft.tuners.lora import Linear4bit as Peft_Linear4bit
+except Exception:
+    Peft_Linear4bit = None
+
+_BNB_LINEAR_TYPES = tuple(t for t in (Bnb_Linear4bit, Peft_Linear4bit) if t is not None)
 
 try:
     from transformers.models.granite.modeling_granite import (
@@ -599,7 +609,7 @@ def post_patch(model, tokenizer, correct_dtype = None):
         correct_dtype = lm_head.weight.dtype
 
         for name, module in model.named_modules():
-            if isinstance(module, (Bnb_Linear4bit, Peft_Linear4bit)):
+            if _BNB_LINEAR_TYPES and isinstance(module, _BNB_LINEAR_TYPES):
                 weight = module.weight
                 quant_state = weight.quant_state
 
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 7af2a7d136..6033822f6f 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -3208,6 +3208,18 @@ def get_peft_model(
         if not SUPPORTS_RSLORA:
             del arguments["use_rslora"]
 
+        # PEFT API compatibility: only pass kwargs supported by the installed peft version.
+        try:
+            import inspect as _inspect
+
+            if (
+                "ensure_weight_tying"
+                not in _inspect.signature(LoraConfig.__init__).parameters
+            ):
+                arguments.pop("ensure_weight_tying", None)
+        except Exception:
+            arguments.pop("ensure_weight_tying", None)
+
         _saved_temp_tokenizer = model._saved_temp_tokenizer
 
         lora_config = LoraConfig(**arguments)
diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py
index 31d54675c9..6bac97c3bf 100644
--- a/unsloth/models/rl_replacements.py
+++ b/unsloth/models/rl_replacements.py
@@ -1889,6 +1889,13 @@ def masked_batch_mean(x):
             if x.shape[1] == 1:  # when importance_sampling_level == "sequence"
                 return x.mean()
             else:
+                # Align mask/coef lengths when left-padding adds extra tokens.
+                if x.shape[1] != completion_mask.shape[1]:
+                    min_len = min(x.shape[1], completion_mask.shape[1])
+                    x = x[:, -min_len:]
+                    cm = completion_mask[:, -min_len:]
+                    denom = cm.sum().clamp(min = 1.0)
+                    return (x * cm).sum() / denom
                 return (x * completion_mask).sum() / completion_token_count
 
         if advantages.dim() == 1:
diff --git a/unsloth/save.py b/unsloth/save.py
index 3628c468e3..860660d557 100644
--- a/unsloth/save.py
+++ b/unsloth/save.py
@@ -33,8 +33,16 @@
 IS_WINDOWS = sys.platform == "win32"
 
 LLAMA_CPP_DEFAULT_DIR = "llama.cpp"
-from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
-from peft.tuners.lora import Linear4bit as Peft_Linear4bit
+
+try:
+    from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
+except Exception:
+    Bnb_Linear4bit = None
+
+try:
+    from peft.tuners.lora import Linear4bit as Peft_Linear4bit
+except Exception:
+    Peft_Linear4bit = None
 from peft.tuners.lora import Linear as Peft_Linear
 from typing import Optional, Callable, Union, List
 import sys
@@ -68,6 +76,10 @@
 from pathlib import Path
 from peft import PeftModelForCausalLM, PeftModel
 
+_MERGE_LORA_LINEAR_TYPES = tuple(
+    t for t in (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear) if t is not None
+)
+
 __all__ = [
     "print_quantization_methods",
     "unsloth_save_model",
@@ -381,7 +393,7 @@ def _free_cached_model(model):
 
 def _merge_lora(layer, name):
     bias = getattr(layer, "bias", None)
-    if isinstance(layer, (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear)):
+    if _MERGE_LORA_LINEAR_TYPES and isinstance(layer, _MERGE_LORA_LINEAR_TYPES):
         # Is LoRA so we need to merge!
         W, quant_state, A, B, s, bias = get_lora_parameters_bias(layer)
         if quant_state is not None: