diff --git a/unsloth/__init__.py b/unsloth/__init__.py
index 1be571b69b..756128f951 100644
--- a/unsloth/__init__.py
+++ b/unsloth/__init__.py
@@ -173,6 +173,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
 # For Gradio HF Spaces?
 # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
 import triton
+import bitsandbytes as bnb
 if DEVICE_TYPE == "cuda":
     libcuda_dirs = lambda: None
     if Version(triton.__version__) >= Version("3.0.0"):
@@ -181,7 +182,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
     else:
         from triton.common.build import libcuda_dirs
     # Try loading bitsandbytes and triton
-    import bitsandbytes as bnb
     try:
         cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
         libcuda_dirs()
@@ -233,7 +233,6 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
     # NO-OP for rocm device
     pass
 elif DEVICE_TYPE == "xpu":
-    # currently intel xpu will not support bnb, will add support in the future
     # TODO: check triton for intel installed properly.
     pass
 
diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py
index 3813c3809c..a0d1539980 100644
--- a/unsloth/kernels/utils.py
+++ b/unsloth/kernels/utils.py
@@ -75,18 +75,19 @@ def calculate_settings(n : int) -> (int, int,):
     pass
 
 HAS_CUDA_STREAM = False
+import bitsandbytes as bnb
+get_ptr = bnb.functional.get_ptr
 # INTEL GPU specific logic
 if DEVICE_TYPE == "xpu":
     # TODO: Changed here after adding XPU BNB support
     HAS_XPU_STREAM = True
     def get_ptr(x: Optional[torch.Tensor]):
         raise RuntimeError("XPU BNB support is not implemented yet. This function should not be called.")
+
 else:
     # NVIDIA-GPU logic here as default
-    import bitsandbytes as bnb
     # https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
     HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
-    get_ptr = bnb.functional.get_ptr
 
 if DEVICE_COUNT > 1:
@@ -148,31 +149,12 @@ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
 # Bitsandbytes operations
 ctypes_c_int = ctypes.c_int
 ctypes_c_int32 = ctypes.c_int32
-# INTEL GPU Specific Logic
-if DEVICE_TYPE == "xpu":
-    # TODO: After adding XPU BNB support, this function should be implemented
-    def cdequantize_blockwise_fp32(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_fp32 should not be called now.")
-
-    def cdequantize_blockwise_fp16_nf4(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_fp16_nf4 should not be called now.")
-
-    def cdequantize_blockwise_bf16_nf4(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_bf16_nf4 should not be called now.")
+cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
+cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
+cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
+cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
+cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
 
-    def cgemm_4bit_inference_naive_fp16(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cgemm_4bit_inference_naive_fp16 should not be called now.")
-
-    def cgemm_4bit_inference_naive_bf16(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cgemm_4bit_inference_naive_bf16 should not be called now.")
-else:
-    # NVIDIA GPU Default Logic
-    cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
-    cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
-    cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
-    cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
-    cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
-pass
 
 torch_device_stream = torch.xpu.current_stream if DEVICE_TYPE == "xpu" else torch.cuda.current_stream
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 41adc74650..a031ba38c4 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -517,8 +517,8 @@ def _is_openai_available(): return False
 # =============================================
 
 # Get Flash Attention v2 if Ampere (RTX 30xx, A100)
-if DEVICE_TYPE in ("cuda", "hip"):
-    import bitsandbytes as bnb
+import bitsandbytes as bnb
+
 from transformers import AutoTokenizer
 from transformers.utils.import_utils import _is_package_available