unslothai · danielhanchen · Feb 8, 2026 · Feb 8, 2026 · May 31, 2026 · May 31, 2026
diff --git a/unsloth/_gpu_init.py b/unsloth/_gpu_init.py
@@ -234,6 +234,14 @@ def is_bf16_supported():
     # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported()
     SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported()
 
+# Backwards compatibility: some notebooks import `unsloth.is_bf16_supported`.
+# Ensure it exists on all backends (HIP / XPU) and has a stable signature.
+if "is_bf16_supported" not in globals():
+
+    def is_bf16_supported(including_emulation = False):
+        return SUPPORTS_BFLOAT16
+
+
 # For Gradio HF Spaces?
 # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
 import triton

@@ -405,7 +405,10 @@ def fix_vllm_aimv2_issue():
     spec = importlib.util.find_spec("vllm")
     if spec is None:
         return
-    vllm_version = importlib_version("vllm")
+    try:
+        vllm_version = importlib_version("vllm")
+    except ImportError:
+        return
     if Version(vllm_version) < Version("0.10.1"):
         vllm_location = spec.origin
         if vllm_location is None:

@@ -136,11 +136,15 @@ def calculate_settings(
 
 
 HAS_CUDA_STREAM = False
-import bitsandbytes as bnb
+try:
+    import bitsandbytes as bnb
 
-# https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
-HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
-get_ptr = bnb.functional.get_ptr
+    # https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
+    HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
+    get_ptr = bnb.functional.get_ptr
+except ImportError:
+    bnb = None
+    get_ptr = None
 
 if DEVICE_TYPE == "xpu":
     HAS_XPU_STREAM = True
@@ -236,21 +240,32 @@ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
         WEIGHT_BUFFERS = []
         ABSMAX_BUFFERS = []
 
-# Bitsandbytes operations
+# Bitsandbytes operations (optional)
 ctypes_c_int = ctypes.c_int
 ctypes_c_int32 = ctypes.c_int32
-cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
-cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
-cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
 
-if DEVICE_TYPE == "xpu":
-    # https://github.com/bitsandbytes-foundation/bitsandbytes/blob/c3b8de268fdb55a88f92feada23fc811a1e6877a/bitsandbytes/backends/xpu/ops.py#L115
-    # for xpu, inference gemv using above link
-    cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemv_4bit_inference_fp16
-    cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemv_4bit_inference_bf16
+if bnb is not None:
+    cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
+    cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
+    cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
+
+    if DEVICE_TYPE == "xpu":
+        # https://github.com/bitsandbytes-foundation/bitsandbytes/blob/c3b8de268fdb55a88f92feada23fc811a1e6877a/bitsandbytes/backends/xpu/ops.py#L115
+        cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemv_4bit_inference_fp16
+        cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemv_4bit_inference_bf16
+    else:
+        cgemm_4bit_inference_naive_fp16 = (
+            bnb.functional.lib.cgemm_4bit_inference_naive_fp16
+        )
+        cgemm_4bit_inference_naive_bf16 = (
+            bnb.functional.lib.cgemm_4bit_inference_naive_bf16
+        )
 else:
-    cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
-    cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
+    cdequantize_blockwise_fp32 = None
+    cdequantize_blockwise_fp16_nf4 = None
+    cdequantize_blockwise_bf16_nf4 = None
+    cgemm_4bit_inference_naive_fp16 = None
+    cgemm_4bit_inference_naive_bf16 = None
 
 
 torch_device_stream = (

@@ -2540,7 +2540,14 @@ def patch_tokenizer(model, tokenizer):
 
 
 def patch_fast_lora():
-    import peft.tuners.lora.bnb
+    try:
+        import peft.tuners.lora.bnb
+    except ImportError as e:
+        print(
+            "Unsloth: bitsandbytes/peft bnb not available - skipping 4bit LoRA patch.",
+            repr(e),
+        )
+        return
     from ..kernels.fast_lora import fast_lora_forward
 
     peft.tuners.lora.bnb.Linear4bit.forward = fast_lora_forward

@@ -30,8 +30,18 @@
     LlamaLinearScalingRotaryEmbedding,
 )
 from .mistral import *
-from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
-from peft.tuners.lora import Linear4bit as Peft_Linear4bit
+
+try:
+    from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
+except ImportError:
+    Bnb_Linear4bit = None
+
+try:
+    from peft.tuners.lora import Linear4bit as Peft_Linear4bit
+except ImportError:
+    Peft_Linear4bit = None
+
+_BNB_LINEAR_TYPES = tuple(t for t in (Bnb_Linear4bit, Peft_Linear4bit) if t is not None)
 
 try:
     from transformers.models.granite.modeling_granite import (
@@ -599,7 +609,7 @@ def post_patch(model, tokenizer, correct_dtype = None):
         correct_dtype = lm_head.weight.dtype
 
         for name, module in model.named_modules():
-            if isinstance(module, (Bnb_Linear4bit, Peft_Linear4bit)):
+            if _BNB_LINEAR_TYPES and isinstance(module, _BNB_LINEAR_TYPES):
                 weight = module.weight
                 quant_state = weight.quant_state
 

@@ -3208,6 +3208,18 @@ def get_peft_model(
         if not SUPPORTS_RSLORA:
             del arguments["use_rslora"]
 
+        # PEFT API compatibility: only pass kwargs supported by the installed peft version.
+        try:
+            import inspect as _inspect
+
+            if (
+                "ensure_weight_tying"
+                not in _inspect.signature(LoraConfig.__init__).parameters
+            ):
+                arguments.pop("ensure_weight_tying", None)
+        except (ImportError, AttributeError, TypeError, ValueError):
+            arguments.pop("ensure_weight_tying", None)
+
         _saved_temp_tokenizer = model._saved_temp_tokenizer
 
         lora_config = LoraConfig(**arguments)

@@ -1889,6 +1889,13 @@ def masked_batch_mean(x):
             if x.shape[1] == 1:  # when importance_sampling_level == "sequence"
                 return x.mean()
             else:
+                # Align mask/coef lengths when left-padding adds extra tokens.
+                if x.shape[1] != completion_mask.shape[1]:
+                    min_len = min(x.shape[1], completion_mask.shape[1])
+                    x = x[:, -min_len:]
+                    cm = completion_mask[:, -min_len:]
+                    denom = cm.sum().clamp(min = 1.0)
+                    return (x * cm).sum() / denom
                 return (x * completion_mask).sum() / completion_token_count
 
         if advantages.dim() == 1:

@@ -33,8 +33,16 @@
 
     IS_WINDOWS = sys.platform == "win32"
     LLAMA_CPP_DEFAULT_DIR = "llama.cpp"
-from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
-from peft.tuners.lora import Linear4bit as Peft_Linear4bit
+
+try:
+    from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
+except ImportError:
+    Bnb_Linear4bit = None
+
+try:
+    from peft.tuners.lora import Linear4bit as Peft_Linear4bit
+except ImportError:
+    Peft_Linear4bit = None
 from peft.tuners.lora import Linear as Peft_Linear
 from typing import Optional, Callable, Union, List
 import sys
@@ -68,6 +76,10 @@
 from pathlib import Path
 from peft import PeftModelForCausalLM, PeftModel
 
+_MERGE_LORA_LINEAR_TYPES = tuple(
+    t for t in (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear) if t is not None
+)
+
 __all__ = [
     "print_quantization_methods",
     "unsloth_save_model",
@@ -381,7 +393,7 @@ def _free_cached_model(model):
 
 def _merge_lora(layer, name):
     bias = getattr(layer, "bias", None)
-    if isinstance(layer, (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear)):
+    if _MERGE_LORA_LINEAR_TYPES and isinstance(layer, _MERGE_LORA_LINEAR_TYPES):
         # Is LoRA so we need to merge!
         W, quant_state, A, B, s, bias = get_lora_parameters_bias(layer)
         if quant_state is not None: