huggingface · vivekgoe · Jun 17, 2025 · Jun 13, 2025
@@ -427,6 +427,11 @@ def __call__(self, parser, namespace, values, option_string=None):
         action="store_true",
         help="Load an AutoAWQ quantized checkpoint using AutoAWQ.",
     )
+    quant_parser_group.add_argument(
+        "--quantize_with_bnb",
+        action="store_true",
+        help="Quantize model to NF4 using BnB and then use NF4 weights for text-generation",
+    )
     quant_parser_group.add_argument(
         "--disk_offload",
         action="store_true",

@@ -299,6 +299,22 @@ def setup_model(args, model_dtype, model_kwargs, logger):
         model = AutoModelForCausalLM.from_pretrained(
             args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs
         )
+    elif args.quantize_with_bnb:
+        from transformers import BitsAndBytesConfig
+
+        nf4_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path,
+            quantization_config=nf4_config,
+            device_map={"": "hpu"},
+            torch_dtype=model_dtype,
+            **model_kwargs,
+        )
     elif args.load_quantized_model_with_inc:
         # TODO: This will be removed in v1.20 Synapse release
         # Override neural_compressor split_rank_state_dict for loading neural_magic models on multi-cards.

@@ -672,5 +672,17 @@
       "output": "DeepSpeed is a machine learning framework that enables the training of large-scale models with reduced computational resources. It achieves this by using a technique called model parallelism, which allows the model to be split across multiple GPUs. This makes it possible to train models that are too large to fit on a single GPU.\n## What is DeepSpeed?\nDeepSpeed is a machine learning framework that enables the training of large-scale models with reduced computational resources. It achieves this by using a technique called model parallelism, which allows the model to be split across multiple GPUs",
       "throughput": 94.70370546821054
     }
+  },
+  "tests/test_text_generation_example.py::test_text_generation_bnb[unsloth/Meta-Llama-3.1-70B-bnb-4bit-1-20-False-True]": {
+    "gaudi2": {
+      "output": "DeepSpeed is a machine learning framework that makes distributed training easy, efficient, and flexible. DeepSpeed can train BERT-Large on",
+      "throughput": 0.7572952
+    }
+  },
+  "tests/test_text_generation_example.py::test_text_generation_bnb[meta-llama/Llama-3.1-70B-1-20-True-True]": {
+    "gaudi2": {
+      "output": "DeepSpeed is a machine learning framework that makes distributed training easy, efficient, and effective. It is a deep learning optimization library that makes",
+      "throughput": 0.7583387
+    }
   }
 }
@@ -90,6 +90,10 @@
         "load_quantized_model_with_autoawq": [
             ("TheBloke/Llama-2-7b-Chat-AWQ", 1, 10, False, 128, 2048),
         ],
+        "run_model_with_bnb": [
+            ("unsloth/Meta-Llama-3.1-70B-bnb-4bit", 1, 20, False, True),
+            ("meta-llama/Llama-3.1-70B", 1, 20, True, True),
+        ],
         "deepspeed": [
             pytest.param("bigscience/bloomz", 8, 1, marks=pytest.mark.x8),
             # pytest.param("meta-llama/Llama-2-70b-hf", 8, 1, marks=pytest.mark.x8),
@@ -141,6 +145,7 @@
         "fp8": [],
         "load_quantized_model_with_autogptq": [],
         "load_quantized_model_with_autoawq": [],
+        "run_model_with_bnb": [],
         "deepspeed": [
             ("bigscience/bloomz-7b1", 8, 1),
         ],
@@ -166,6 +171,7 @@ def _test_text_generation(
     fp8: bool = False,
     load_quantized_model_with_autogptq: bool = False,
     load_quantized_model_with_autoawq: bool = False,
+    quantize_with_bnb: bool = False,
     max_input_tokens: int = 0,
     max_output_tokens: int = 100,
     parallel_strategy: str = None,
@@ -304,6 +310,8 @@ def _test_text_generation(
         command += ["--load_quantized_model_with_autogptq"]
     if load_quantized_model_with_autoawq:
         command += ["--load_quantized_model_with_autoawq"]
+    if quantize_with_bnb:
+        command += ["--quantize_with_bnb"]
     if parallel_strategy is not None:
         command += [
             f"--parallel_strategy={parallel_strategy}",
@@ -496,6 +504,32 @@ def test_text_generation_awq(
     )
 
 
+@pytest.mark.skipif(condition=bool("gaudi1" == OH_DEVICE_CONTEXT), reason=f"Skipping test for {OH_DEVICE_CONTEXT}")
+@pytest.mark.parametrize(  # one generated test case per tuple in the list below
+    "model_name, world_size, output_len, quantize_with_bnb, check_output",
+    MODELS_TO_TEST["run_model_with_bnb"],  # tuples follow the argument order named above
+)
+def test_text_generation_bnb(  # BnB text-generation cases; skipped when OH_DEVICE_CONTEXT == "gaudi1"
+    model_name: str,  # model identifier, forwarded unchanged as the first positional argument
+    world_size: int,  # forwarded as world_size
+    output_len: int,  # forwarded as max_output_tokens
+    quantize_with_bnb: bool,  # forwarded as quantize_with_bnb (presumably enables --quantize_with_bnb; verify in helper)
+    check_output: bool,  # forwarded as check_output
+    baseline,  # forwarded positionally; presumably a pytest fixture (TODO confirm)
+    token,  # forwarded positionally; presumably a pytest fixture (TODO confirm)
+):
+    _test_text_generation(
+        model_name,
+        baseline,
+        token,
+        world_size=world_size,
+        torch_compile=True,  # hard-coded: every BnB case runs with torch_compile enabled
+        quantize_with_bnb=quantize_with_bnb,
+        max_output_tokens=output_len,
+        check_output=check_output,
+    )
+
+
 @pytest.mark.parametrize("model_name, world_size, batch_size", MODELS_TO_TEST["deepspeed"])
 def test_text_generation_deepspeed(model_name: str, world_size: int, batch_size: int, baseline, token):  # runs each "deepspeed" case with deepspeed=True
     _test_text_generation(model_name, baseline, token, deepspeed=True, world_size=world_size, batch_size=batch_size)