From dbdce096b8f46ce5df461938f5c12861712a43ff Mon Sep 17 00:00:00 2001 From: sstamenk Date: Fri, 6 Mar 2026 00:07:39 +0100 Subject: [PATCH] Conditionally enable 4bit on CDNA for bitsandbytes>=v0.49.2 --- unsloth/device_type.py | 17 +++++++++++------ unsloth/models/loader.py | 8 ++++---- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/unsloth/device_type.py b/unsloth/device_type.py index 0f924bfdfd..a42d2b9fab 100644 --- a/unsloth/device_type.py +++ b/unsloth/device_type.py @@ -79,16 +79,19 @@ def get_device_count(): DEVICE_COUNT: int = get_device_count() # 4-bit quantization requires a block size of 64 -# this is not supported on AMD Instinct GPUs currently # | Device Type | Warp Size | Block Size | # |-----------------|-----------|------------| -# | CUDA | 32 | 64 | -# | Radeon (Navi) | 32 | 64 | -# | Instinct (MI) | 64 | 128 | +# | CUDA | 32 | 32 | +# | Radeon (Navi) | 32 | 32 | +# | Instinct (MI) | 64 | 32 | # # Since bitsandbytes 0.49.0, pre-quantized models with 64 blockwise now works -# on Radeon GPUs, but not Instinct MI300x for eg [WIP] +# on Radeon GPUs, but not Instinct MI300x for eg # See https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1748 +# +# Since bitsandbytes 0.49.2, blocksize=64 4-bit quantization is supported on +# CDNA (MI Instinct / gfx9xx) GPUs as well +# See https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1856 ALLOW_PREQUANTIZED_MODELS: bool = True # HSA_STATUS_ERROR_EXCEPTION checks - sometimes AMD fails for BnB @@ -104,7 +107,9 @@ def get_device_count(): ALLOW_BITSANDBYTES = False if ALLOW_BITSANDBYTES: ALLOW_BITSANDBYTES = Version(bitsandbytes.__version__) > Version("0.48.2.dev0") - if Version(bitsandbytes.__version__) > Version("0.49.0"): + if Version(bitsandbytes.__version__) >= Version("0.49.2"): + pass + elif Version(bitsandbytes.__version__) >= Version("0.49.0"): try: # Pre-quantized bitsandbytes models use blocksize 64, so we need to check the GPU from bitsandbytes.cextension import ROCM_WARP_SIZE_64 diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index e7b975ceea..bd15ed5281 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -397,7 +397,7 @@ def from_pretrained( load_in_fp8 = False # Check if pre-quantized models are allowed - # For eg AMD Instinct GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + # AMD Instinct GPUs need blocksize = 128 on bitsandbytes < 0.49.2 (our pre-quants use blocksize = 64) if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith( ("-unsloth-bnb-4bit", "-bnb-4bit") ): @@ -537,7 +537,7 @@ def from_pretrained( trust_remote_code = trust_remote_code, ) # Check if pre-quantized models are allowed - # For eg AMD Instinct GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + # AMD Instinct GPUs need blocksize = 128 on bitsandbytes < 0.49.2 (our pre-quants use blocksize = 64) if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith( ("-unsloth-bnb-4bit", "-bnb-4bit") ): @@ -1005,7 +1005,7 @@ def from_pretrained( load_in_fp8 = False # Check if pre-quantized models are allowed - # For eg AMD Instinct GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + # AMD Instinct GPUs need blocksize = 128 on bitsandbytes < 0.49.2 (our pre-quants use blocksize = 64) if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith( ("-unsloth-bnb-4bit", "-bnb-4bit") ): @@ -1288,7 +1288,7 @@ def from_pretrained( if not use_exact_model_name: model_name = get_model_name(model_name, load_in_4bit) # Check if pre-quantized models are allowed - # For eg AMD Instinct GPUs need blocksize = 128, but our pre-quants are blocksize = 64 + # AMD Instinct GPUs need blocksize = 128 on bitsandbytes < 0.49.2 (our pre-quants use blocksize = 64) if not ALLOW_PREQUANTIZED_MODELS and model_name.lower().endswith( ("-unsloth-bnb-4bit", "-bnb-4bit") ):