diff --git a/unsloth/__init__.py b/unsloth/__init__.py
index 1be571b69..56b0ff9d3 100644
--- a/unsloth/__init__.py
+++ b/unsloth/__init__.py
@@ -233,7 +233,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
     # NO-OP for rocm device
     pass
 elif DEVICE_TYPE == "xpu":
-    # currently intel xpu will not support bnb, will add support in the future
+    import bitsandbytes as bnb
     # TODO: check triton for intel installed properly.
     pass
 
diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py
index cb8982df0..523afabbf 100644
--- a/unsloth/kernels/utils.py
+++ b/unsloth/kernels/utils.py
@@ -75,19 +75,13 @@ def calculate_settings(n : int) -> (int, int,):
 pass
 
 HAS_CUDA_STREAM = False
-# INTEL GPU specific logic
+import bitsandbytes as bnb
+# https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
+HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
+get_ptr = bnb.functional.get_ptr
+
 if DEVICE_TYPE == "xpu":
-    # TODO: Changed here after adding XPU BNB support
     HAS_XPU_STREAM = True
-    def get_ptr(x: Optional[torch.Tensor]):
-        raise RuntimeError("XPU BNB support is not implemented yet. This function should not be called.")
-else:
-    # NVIDIA-GPU logic here as default
-    import bitsandbytes as bnb
-    # https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
-    HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
-    get_ptr = bnb.functional.get_ptr
-
 
 if DEVICE_COUNT > 1:
     if DEVICE_TYPE in ("cuda", "hip"):
@@ -148,31 +142,19 @@ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
 # Bitsandbytes operations
 ctypes_c_int = ctypes.c_int
 ctypes_c_int32 = ctypes.c_int32
-# INTEL GPU Specific Logic
-if DEVICE_TYPE == "xpu":
-    # TODO: After adding XPU BNB support, this function should be implemented
-    def cdequantize_blockwise_fp32(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_fp32 should not be called now.")
-
-    def cdequantize_blockwise_fp16_nf4(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_fp16_nf4 should not be called now.")
-
-    def cdequantize_blockwise_bf16_nf4(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_bf16_nf4 should not be called now.")
-
-    def cgemm_4bit_inference_naive_fp16(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cgemm_4bit_inference_naive_fp16 should not be called now.")
+cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
+cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
+cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
 
-    def cgemm_4bit_inference_naive_bf16(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cgemm_4bit_inference_naive_bf16 should not be called now.")
+if DEVICE_TYPE == "xpu":
+    # https://github.com/bitsandbytes-foundation/bitsandbytes/blob/c3b8de268fdb55a88f92feada23fc811a1e6877a/bitsandbytes/backends/xpu/ops.py#L115
+    # for xpu, inference gemv using above link
+    cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemv_4bit_inference_fp16
+    cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemv_4bit_inference_bf16
 else:
-    # NVIDIA GPU Default Logic
-    cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
-    cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
-    cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
     cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
     cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
-pass
+
 
 torch_device_stream = torch.xpu.current_stream if DEVICE_TYPE == "xpu" else torch.cuda.current_stream
@@ -517,8 +499,12 @@ def fast_gemv(X, W, quant_state, out = None):
         # assert(out.shape == (1, 1, bout,))
     # pass
 
-    n = 1
-    m = shape[0]
+    if DEVICE_TYPE == "xpu":
+        m = 1
+        n = shape[0]
+    else:
+        n = 1
+        m = shape[0]
     k = shape[1]
     lda = shape[0]
     ldc = shape[0]
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 857c4902d..78b04e267 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -517,8 +517,7 @@ def _is_openai_available(): return False
 
 # =============================================
 # Get Flash Attention v2 if Ampere (RTX 30xx, A100)
-if DEVICE_TYPE in ("cuda", "hip"):
-    import bitsandbytes as bnb
+import bitsandbytes as bnb
 
 from transformers import AutoTokenizer
 from transformers.utils.import_utils import _is_package_available
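
For reviewers trying this locally, here is a small sanity-check sketch (not part of the patch) that verifies the installed bitsandbytes build actually exposes the native symbols the diff binds. Only the symbol names are taken from the diff; the rest is illustrative, and torch.xpu.is_available() is used purely to pick which symbol set to look for.

# Sanity-check sketch (not part of the patch): confirm the local bitsandbytes
# build exposes the native symbols bound in unsloth/kernels/utils.py above.
import torch
import bitsandbytes as bnb

required = [
    "cdequantize_blockwise_fp32",
    "cdequantize_blockwise_fp16_nf4",
    "cdequantize_blockwise_bf16_nf4",
]
if hasattr(torch, "xpu") and torch.xpu.is_available():
    # XPU builds ship gemv inference kernels (see the ops.py link in the diff)
    required += ["cgemv_4bit_inference_fp16", "cgemv_4bit_inference_bf16"]
else:
    # CUDA/HIP builds ship the naive gemm inference kernels
    required += ["cgemm_4bit_inference_naive_fp16", "cgemm_4bit_inference_naive_bf16"]

missing = [s for s in required if getattr(bnb.functional.lib, s, None) is None]
print(f"bitsandbytes {bnb.__version__}: missing native symbols: {missing or 'none'}")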