3 changes: 1 addition & 2 deletions unsloth/__init__.py
@@ -173,6 +173,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
 # For Gradio HF Spaces?
 # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
 import triton
+import bitsandbytes as bnb
 if DEVICE_TYPE == "cuda":
     libcuda_dirs = lambda: None
     if Version(triton.__version__) >= Version("3.0.0"):
@@ -181,7 +182,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
     else: from triton.common.build import libcuda_dirs
 
     # Try loading bitsandbytes and triton
-    import bitsandbytes as bnb
     try:
         cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
         libcuda_dirs()
@@ -233,7 +233,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
     # NO-OP for rocm device
     pass
 elif DEVICE_TYPE == "xpu":
-    # currently intel xpu will not support bnb, will add support in the future
     # TODO: check triton for intel installed properly.
     pass
 
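Taken together, these three hunks hoist the bitsandbytes import to module scope, so every device branch sees the same module object, and drop the stale comment about XPU being unsupported. A minimal sketch of the resulting pattern; DEVICE_TYPE here is a stand-in for unsloth's device detection earlier in the file, and the error message is illustrative, not unsloth's actual wording:

import triton
import bitsandbytes as bnb  # imported unconditionally, before any device branch

DEVICE_TYPE = "cuda"  # stand-in; the real value comes from unsloth's device detection

if DEVICE_TYPE == "cuda":
    # Binding one symbol from the compiled library is enough to surface a
    # broken bitsandbytes install early.
    try:
        cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
    except AttributeError as e:
        raise ImportError("bitsandbytes CUDA kernels failed to load") from e
elif DEVICE_TYPE == "xpu":
    # No stub needed: the import above already succeeded on XPU.
    pass
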
34 changes: 8 additions & 26 deletions unsloth/kernels/utils.py
@@ -75,18 +75,19 @@ def calculate_settings(n : int) -> (int, int,):
 pass
 
 HAS_CUDA_STREAM = False
+import bitsandbytes as bnb
+get_ptr = bnb.functional.get_ptr
 # INTEL GPU specific logic
 if DEVICE_TYPE == "xpu":
-    # TODO: Changed here after adding XPU BNB support
     HAS_XPU_STREAM = True
-    def get_ptr(x: Optional[torch.Tensor]):
-        raise RuntimeError("XPU BNB support is not implemented yet. This function should not be called.")
-
 else:
-    # NVIDIA-GPU logic here as default
-    import bitsandbytes as bnb
     # https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
     HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
-    get_ptr = bnb.functional.get_ptr
 
 
 if DEVICE_COUNT > 1:
@@ -148,31 +149,12 @@ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
 # Bitsandbytes operations
 ctypes_c_int = ctypes.c_int
 ctypes_c_int32 = ctypes.c_int32
-# INTEL GPU Specific Logic
-if DEVICE_TYPE == "xpu":
-    # TODO: After adding XPU BNB support, this function should be implemented
-    def cdequantize_blockwise_fp32(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_fp32 should not be called now.")
-
-    def cdequantize_blockwise_fp16_nf4(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_fp16_nf4 should not be called now.")
-
-    def cdequantize_blockwise_bf16_nf4(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_bf16_nf4 should not be called now.")
+cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
+cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
+cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
+cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
+cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
-
-    def cgemm_4bit_inference_naive_fp16(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cgemm_4bit_inference_naive_fp16 should not be called now.")
-
-    def cgemm_4bit_inference_naive_bf16(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cgemm_4bit_inference_naive_bf16 should not be called now.")
-else:
-    # NVIDIA GPU Default Logic
-    cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
-    cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
-    cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
-    cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
-    cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
 pass
 
 torch_device_stream = torch.xpu.current_stream if DEVICE_TYPE == "xpu" else torch.cuda.current_stream
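The XPU stubs deleted above existed only so these names were defined when bitsandbytes could not be imported on Intel GPUs; with the import now unconditional, the shared-library symbols are bound once at module scope for every backend. A hedged sketch of the binding pattern follows; the attribute paths mirror the diff, but the tensor is illustrative (it assumes a CUDA device) and the kernels' full argument lists are version-dependent, so no kernel call is shown:

import torch
import bitsandbytes as bnb

# Bind the C entry points once; call sites then pay a single local lookup
# instead of a module-attribute chain on every invocation.
get_ptr = bnb.functional.get_ptr
cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4

# get_ptr wraps tensor.data_ptr() in a ctypes pointer for the C kernels,
# and returns None when given a None tensor.
t = torch.empty(8, dtype=torch.uint8, device="cuda")
ptr = get_ptr(t)
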
4 changes: 2 additions & 2 deletions unsloth/models/_utils.py
@@ -517,8 +517,8 @@ def _is_openai_available(): return False

 # =============================================
 # Get Flash Attention v2 if Ampere (RTX 30xx, A100)
-if DEVICE_TYPE in ("cuda", "hip"):
-    import bitsandbytes as bnb
+import bitsandbytes as bnb
 
 
 from transformers import AutoTokenizer
 from transformers.utils.import_utils import _is_package_available
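One contextual note on the unchanged imports above: transformers' _is_package_available is the standard probe for optional dependencies, which is how code near this hunk can gate features such as Flash Attention. A small sketch; probing "flash_attn" here is an illustrative assumption, not part of this diff:

from transformers.utils.import_utils import _is_package_available

# True only if the package is installed and importable in this environment.
if _is_package_available("flash_attn"):
    print("Flash Attention v2 is available")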