From dbdce096b8f46ce5df461938f5c12861712a43ff Mon Sep 17 00:00:00 2001
From: sstamenk <strahinja.stamenkovic@amd.com>
Date: Fri, 6 Mar 2026 00:07:39 +0100
Subject: [PATCH] Conditionally enable 4bit on CDNA for bitsandbytes>=v0.49.2

---
 unsloth/device_type.py   | 17 +++++++++++------
 unsloth/models/loader.py |  8 ++++----
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/unsloth/device_type.py b/unsloth/device_type.py
index 0f924bfdfd..a42d2b9fab 100644
--- a/unsloth/device_type.py
+++ b/unsloth/device_type.py
@@ -79,16 +79,19 @@ def get_device_count():
 DEVICE_COUNT: int = get_device_count()
 
 # 4-bit quantization requires a block size of 64
-# this is not supported on AMD Instinct GPUs currently
 # | Device Type     | Warp Size | Block Size |
 # |-----------------|-----------|------------|
-# | CUDA            |    32     |     64     |
-# | Radeon (Navi)   |    32     |     64     |
-# | Instinct (MI)   |    64     |    128     |
+# | CUDA            |    32     |     32     |
+# | Radeon (Navi)   |    32     |     32     |
+# | Instinct (MI)   |    64     |     32     |
 #
 # Since bitsandbytes 0.49.0, pre-quantized models with 64 blockwise now works
-# on Radeon GPUs, but not Instinct MI300x for eg [WIP]
+# on Radeon GPUs, but not on Instinct GPUs (e.g. MI300X)
 # See https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1748
+#
+# Since bitsandbytes 0.49.2, 4-bit quantization with blocksize=64 is also
+# supported on CDNA (Instinct MI series / gfx9xx) GPUs
+# See https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1856
 
 ALLOW_PREQUANTIZED_MODELS: bool = True
 # HSA_STATUS_ERROR_EXCEPTION checks - sometimes AMD fails for BnB
@@ -104,7 +107,9 @@ def get_device_count():
         ALLOW_BITSANDBYTES = False
     if ALLOW_BITSANDBYTES:
         ALLOW_BITSANDBYTES = Version(bitsandbytes.__version__) > Version("0.48.2.dev0")
-        if Version(bitsandbytes.__version__) > Version("0.49.0"):
+        if Version(bitsandbytes.__version__) >= Version("0.49.2"):
+            pass
+        elif Version(bitsandbytes.__version__) >= Version("0.49.0"):
             try:
                 # Pre-quantized bitsandbytes models use blocksize 64, so we need to check the GPU
                 from bitsandbytes.cextension import ROCM_WARP_SIZE_64
diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py
index e7b975ceea..bd15ed5281 100644
--- a/unsloth/models/loader.py
+++ b/unsloth/models/loader.py
@@ -397,7 +397,7 @@ def from_pretrained(
                     load_in_fp8 = False
 
         # Check if pre-quantized models are allowed
-        # For eg AMD Instinct GPUs need blocksize = 128, but our pre-quants are blocksize = 64
+        # AMD Instinct GPUs need blocksize = 128 on bitsandbytes < 0.49.2 (our pre-quants use blocksize = 64)
         if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(
             ("-unsloth-bnb-4bit", "-bnb-4bit")
         ):
@@ -537,7 +537,7 @@ def from_pretrained(
                     trust_remote_code = trust_remote_code,
                 )
             # Check if pre-quantized models are allowed
-            # For eg AMD Instinct GPUs need blocksize = 128, but our pre-quants are blocksize = 64
+            # AMD Instinct GPUs need blocksize = 128 on bitsandbytes < 0.49.2 (our pre-quants use blocksize = 64)
             if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(
                 ("-unsloth-bnb-4bit", "-bnb-4bit")
             ):
@@ -1005,7 +1005,7 @@ def from_pretrained(
                     load_in_fp8 = False
 
         # Check if pre-quantized models are allowed
-        # For eg AMD Instinct GPUs need blocksize = 128, but our pre-quants are blocksize = 64
+        # AMD Instinct GPUs need blocksize = 128 on bitsandbytes < 0.49.2 (our pre-quants use blocksize = 64)
         if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(
             ("-unsloth-bnb-4bit", "-bnb-4bit")
         ):
@@ -1288,7 +1288,7 @@ def from_pretrained(
             if not use_exact_model_name:
                 model_name = get_model_name(model_name, load_in_4bit)
             # Check if pre-quantized models are allowed
-            # For eg AMD Instinct GPUs need blocksize = 128, but our pre-quants are blocksize = 64
+            # AMD Instinct GPUs need blocksize = 128 on bitsandbytes < 0.49.2 (our pre-quants use blocksize = 64)
             if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith(
                 ("-unsloth-bnb-4bit", "-bnb-4bit")
             ):