diff --git a/unsloth/__init__.py b/unsloth/__init__.py
index 1be571b69..56b0ff9d3 100644
--- a/unsloth/__init__.py
+++ b/unsloth/__init__.py
@@ -233,7 +233,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
     # NO-OP for rocm device
     pass
 elif DEVICE_TYPE == "xpu":
-    # currently intel xpu will not support bnb, will add support in the future
+    import bitsandbytes as bnb
     # TODO: check triton for intel installed properly.
     pass
 
diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py
index cb8982df0..523afabbf 100644
--- a/unsloth/kernels/utils.py
+++ b/unsloth/kernels/utils.py
@@ -75,19 +75,13 @@ def calculate_settings(n : int) -> (int, int,):
 pass
 
 HAS_CUDA_STREAM = False
-# INTEL GPU specific logic
+import bitsandbytes as bnb
+# https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
+HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
+get_ptr = bnb.functional.get_ptr
+
 if DEVICE_TYPE == "xpu":
-    # TODO: Changed here after adding XPU BNB support
     HAS_XPU_STREAM = True
-    def get_ptr(x: Optional[torch.Tensor]):
-        raise RuntimeError("XPU BNB support is not implemented yet. This function should not be called.")
-else:
-    # NVIDIA-GPU logic here as default
-    import bitsandbytes as bnb
-    # https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
-    HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
-    get_ptr = bnb.functional.get_ptr
-
 
 if DEVICE_COUNT > 1:
     if DEVICE_TYPE in ("cuda", "hip"):
@@ -148,31 +142,19 @@ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
 # Bitsandbytes operations
 ctypes_c_int = ctypes.c_int
 ctypes_c_int32 = ctypes.c_int32
-# INTEL GPU Specific Logic
-if DEVICE_TYPE == "xpu":
-    # TODO: After adding XPU BNB support, this function should be implemented
-    def cdequantize_blockwise_fp32(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_fp32 should not be called now.")
-
-    def cdequantize_blockwise_fp16_nf4(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_fp16_nf4 should not be called now.")
-
-    def cdequantize_blockwise_bf16_nf4(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cdequantize_blockwise_bf16_nf4 should not be called now.")
-
-    def cgemm_4bit_inference_naive_fp16(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cgemm_4bit_inference_naive_fp16 should not be called now.")
+cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
+cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
+cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
 
-    def cgemm_4bit_inference_naive_bf16(*args, **kwargs):
-        raise RuntimeError("XPU BNB support is not implemented yet. cgemm_4bit_inference_naive_bf16 should not be called now.")
+if DEVICE_TYPE == "xpu":
+    # https://github.com/bitsandbytes-foundation/bitsandbytes/blob/c3b8de268fdb55a88f92feada23fc811a1e6877a/bitsandbytes/backends/xpu/ops.py#L115
+    # for xpu, inference gemv using above link
+    cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemv_4bit_inference_fp16
+    cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemv_4bit_inference_bf16
 else:
-    # NVIDIA GPU Default Logic
-    cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
-    cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
-    cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
     cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
     cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
-pass
+
 
 torch_device_stream = torch.xpu.current_stream if DEVICE_TYPE == "xpu" else torch.cuda.current_stream
@@ -517,8 +499,12 @@ def fast_gemv(X, W, quant_state, out = None):
         # assert(out.shape == (1, 1, bout,))
     # pass
 
-    n = 1
-    m = shape[0]
+    if DEVICE_TYPE == "xpu":
+        m = 1
+        n = shape[0]
+    else:
+        n = 1
+        m = shape[0]
     k = shape[1]
     lda = shape[0]
     ldc = shape[0]
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 857c4902d..78b04e267 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -517,8 +517,7 @@ def _is_openai_available(): return False
 
 # =============================================
 # Get Flash Attention v2 if Ampere (RTX 30xx, A100)
-if DEVICE_TYPE in ("cuda", "hip"):
-    import bitsandbytes as bnb
+import bitsandbytes as bnb
 
 from transformers import AutoTokenizer
 from transformers.utils.import_utils import _is_package_available
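
For reviewers trying this locally, here is a small sanity-check sketch (not part of the patch) that verifies the installed bitsandbytes build actually exposes the native symbols the diff binds. Only the symbol names are taken from the diff; the rest is illustrative, and torch.xpu.is_available() is used purely to pick which symbol set to look for.

# Sanity-check sketch (not part of the patch): confirm the local bitsandbytes
# build exposes the native symbols bound in unsloth/kernels/utils.py above.
import torch
import bitsandbytes as bnb

required = [
    "cdequantize_blockwise_fp32",
    "cdequantize_blockwise_fp16_nf4",
    "cdequantize_blockwise_bf16_nf4",
]
if hasattr(torch, "xpu") and torch.xpu.is_available():
    # XPU builds ship gemv inference kernels (see the ops.py link in the diff)
    required += ["cgemv_4bit_inference_fp16", "cgemv_4bit_inference_bf16"]
else:
    # CUDA/HIP builds ship the naive gemm inference kernels
    required += ["cgemm_4bit_inference_naive_fp16", "cgemm_4bit_inference_naive_bf16"]

missing = [s for s in required if getattr(bnb.functional.lib, s, None) is None]
print(f"bitsandbytes {bnb.__version__}: missing native symbols: {missing or 'none'}")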