Merged
2 changes: 1 addition & 1 deletion unsloth/__init__.py
@@ -233,7 +233,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
# NO-OP for rocm device
pass
elif DEVICE_TYPE == "xpu":
# currently intel xpu will not support bnb, will add support in the future
import bitsandbytes as bnb
# TODO: check that Triton for Intel is installed properly.
pass

54 changes: 20 additions & 34 deletions unsloth/kernels/utils.py
@@ -75,19 +75,13 @@ def calculate_settings(n : int) -> (int, int,):
pass

HAS_CUDA_STREAM = False
# INTEL GPU specific logic
import bitsandbytes as bnb
# https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
get_ptr = bnb.functional.get_ptr

if DEVICE_TYPE == "xpu":
# TODO: Changed here after adding XPU BNB support
HAS_XPU_STREAM = True
Collaborator

Small nit. In this case both HAS_CUDA_STREAM and HAS_XPU_STREAM could be True. For clarity it would be good to make sure it's one or the other.
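One possible shape for that, as a minimal sketch (assuming DEVICE_TYPE, Version, and bnb are already in scope as in unsloth/kernels/utils.py):

# Sketch only: make the two flags mutually exclusive.
HAS_XPU_STREAM  = DEVICE_TYPE == "xpu"
HAS_CUDA_STREAM = (not HAS_XPU_STREAM) and Version(bnb.__version__) > Version("0.43.3")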


I think it's OK since it's really just talking about the API and not the device availability. Maybe the naming doesn't explain that well enough. But in this case, the bitsandbytes C API requires stream arguments on some functions for both CUDA/XPU.

But to be honest, separate from this PR, I would suggest bumping the minimum bitsandbytes version for CUDA to at least >=0.45.0, ideally >=0.46.0 to ensure torch.compile compatibility. If that's done, then the HAS_CUDA_STREAM and HAS_XPU_STREAM parts can be completely removed. (The check is always true for >=0.44.0.)

For Blackwell, the minimum bitsandbytes needed would be >=0.45.3.

For Intel, the minimum bitsandbytes should be >=0.48.0.
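As a rough illustration only (not part of this PR), an up-front gate on those suggested minimums might look like the sketch below; is_blackwell is a hypothetical placeholder for an sm_100 capability check, and the version numbers are taken from this comment rather than from any official support matrix.

# Hypothetical sketch of enforcing the minimum bitsandbytes versions above.
from packaging.version import Version
import bitsandbytes as bnb

DEVICE_TYPE = "cuda"   # placeholder; unsloth resolves this elsewhere
is_blackwell = False   # placeholder for an sm_100 capability check

bnb_version = Version(bnb.__version__)
if DEVICE_TYPE == "xpu":
    assert bnb_version >= Version("0.48.0"), "Intel XPU needs bitsandbytes >= 0.48.0"
elif is_blackwell:
    assert bnb_version >= Version("0.45.3"), "Blackwell needs bitsandbytes >= 0.45.3"
else:
    assert bnb_version >= Version("0.46.0"), "torch.compile compatibility needs bitsandbytes >= 0.46.0"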


Actually, since I see in pyproject.toml that Unsloth already pins to bitsandbytes>=0.45.5, the checks around HAS_CUDA_STREAM can be removed already.
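If that route were taken, the module-level setup in unsloth/kernels/utils.py could collapse to something like this sketch (assuming the bitsandbytes>=0.45.5 pin, so the stream-taking C API is always available):

# Sketch only: with bitsandbytes >= 0.45.5 guaranteed, the version gate and
# the HAS_CUDA_STREAM / HAS_XPU_STREAM flags can be dropped, and callers can
# always pass the current device stream to the bitsandbytes C API.
import bitsandbytes as bnb

get_ptr = bnb.functional.get_ptr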

Contributor Author

Shall we create a new PR to remove HAS_CUDA_STREAM, instead of doing it in this PR?

def get_ptr(x: Optional[torch.Tensor]):
raise RuntimeError("XPU BNB support is not implemented yet. This function should not be called.")
else:
# NVIDIA-GPU logic here as default
import bitsandbytes as bnb
# https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
get_ptr = bnb.functional.get_ptr


if DEVICE_COUNT > 1:
if DEVICE_TYPE in ("cuda", "hip"):
@@ -148,31 +142,19 @@ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
# Bitsandbytes operations
ctypes_c_int = ctypes.c_int
ctypes_c_int32 = ctypes.c_int32
# INTEL GPU Specific Logic
if DEVICE_TYPE == "xpu":
# TODO: After adding XPU BNB support, this function should be implemented
def cdequantize_blockwise_fp32(*args, **kwargs):
raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_fp32 should not be called now.")

def cdequantize_blockwise_fp16_nf4(*args, **kwargs):
raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_fp16_nf4 should not be called now.")

def cdequantize_blockwise_bf16_nf4(*args, **kwargs):
raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_bf16_nf4 should not be called now.")

def cgemm_4bit_inference_naive_fp16(*args, **kwargs):
raise RuntimeError("XPU BNB support is not implemented yet. cgemm_4bit_inference_naive_fp16 should not be called now.")
cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4

def cgemm_4bit_inference_naive_bf16(*args, **kwargs):
raise RuntimeError("XPU BNB support is not implemented yet. cgemm_4bit_inference_naive_bf16 should not be called now.")
if DEVICE_TYPE == "xpu":
# https://github.com/bitsandbytes-foundation/bitsandbytes/blob/c3b8de268fdb55a88f92feada23fc811a1e6877a/bitsandbytes/backends/xpu/ops.py#L115
# for XPU, inference GEMV uses the kernels at the link above
cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemv_4bit_inference_fp16
cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemv_4bit_inference_bf16
else:
# NVIDIA GPU Default Logic
cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
pass


torch_device_stream = torch.xpu.current_stream if DEVICE_TYPE == "xpu" else torch.cuda.current_stream

@@ -517,8 +499,12 @@ def fast_gemv(X, W, quant_state, out = None):
# assert(out.shape == (1, 1, bout,))
# pass

n = 1
m = shape[0]
if DEVICE_TYPE == "xpu":
m = 1
n = shape[0]
else:
n = 1
m = shape[0]
k = shape[1]
lda = shape[0]
ldc = shape[0]
3 changes: 1 addition & 2 deletions unsloth/models/_utils.py
@@ -517,8 +517,7 @@ def _is_openai_available(): return False

# =============================================
# Get Flash Attention v2 if Ampere (RTX 30xx, A100)
if DEVICE_TYPE in ("cuda", "hip"):
import bitsandbytes as bnb
import bitsandbytes as bnb

from transformers import AutoTokenizer
from transformers.utils.import_utils import _is_package_available