From 0248657bd5877ba744c9a69d2be7cb54692e1f78 Mon Sep 17 00:00:00 2001
From: Ruheena Suhani Shaik <rsshaik@habana.ai>
Date: Fri, 13 Jun 2025 08:41:07 +0300
Subject: [PATCH] Integrate NF4 tests into the text-generation example

---
 examples/text-generation/run_generation.py    |  5 ++
 examples/text-generation/utils.py             | 16 +++++
 .../fixture/tests/test_bnb_inference.json     |  8 ---
 .../tests/test_text_generation_example.json   | 12 ++++
 tests/test_bnb_inference.py                   | 67 -------------------
 tests/test_text_generation_example.py         | 34 ++++++++++
 6 files changed, 67 insertions(+), 75 deletions(-)
 delete mode 100644 tests/baselines/fixture/tests/test_bnb_inference.json
 delete mode 100644 tests/test_bnb_inference.py

diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index 40f1c3f3d2..1187595be7 100644
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -427,6 +427,11 @@ def __call__(self, parser, namespace, values, option_string=None):
         action="store_true",
         help="Load an AutoAWQ quantized checkpoint using AutoAWQ.",
     )
+    quant_parser_group.add_argument(
+        "--quantize_with_bnb",
+        action="store_true",
+        help="Quantize model to NF4 using BnB and then use NF4 weights for text-generation",
+    )
     quant_parser_group.add_argument(
         "--disk_offload",
         action="store_true",
diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py
index 9654c21bda..b43aa8bae0 100644
--- a/examples/text-generation/utils.py
+++ b/examples/text-generation/utils.py
@@ -299,6 +299,22 @@ def setup_model(args, model_dtype, model_kwargs, logger):
         model = AutoModelForCausalLM.from_pretrained(
             args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs
         )
+    elif args.quantize_with_bnb:
+        from transformers import BitsAndBytesConfig
+
+        nf4_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path,
+            quantization_config=nf4_config,
+            device_map={"": "hpu"},
+            torch_dtype=model_dtype,
+            **model_kwargs,
+        )
     elif args.load_quantized_model_with_inc:
         # TODO: This will be removed in v1.20 Synapse release
         # Override neural_compressor split_rank_state_dict for loading neural_magic models on multi-cards.
diff --git a/tests/baselines/fixture/tests/test_bnb_inference.json b/tests/baselines/fixture/tests/test_bnb_inference.json
deleted file mode 100644
index cb7a0ef2d0..0000000000
--- a/tests/baselines/fixture/tests/test_bnb_inference.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "tests/test_bnb_inference.py::test_nf4_quantization_inference[True-meta-llama/Llama-3.2-1B]": {
-    "output": "Hello my name is Marlene and I am 36 years old. I am a very happy and loving person. I"
-  },
-  "tests/test_bnb_inference.py::test_nf4_quantization_inference[False-meta-llama/Llama-3.2-1B]": {
-    "output": "Hello my name is Marlene and I am 36 years old. I am a very happy and loving person. I"
-  }
-}
\ No newline at end of file
diff --git a/tests/baselines/fixture/tests/test_text_generation_example.json b/tests/baselines/fixture/tests/test_text_generation_example.json
index 299bf50686..4d90e72dc6 100644
--- a/tests/baselines/fixture/tests/test_text_generation_example.json
+++ b/tests/baselines/fixture/tests/test_text_generation_example.json
@@ -672,5 +672,17 @@
       "output": "DeepSpeed is a machine learning framework that enables the training of large-scale models with reduced computational resources. It achieves this by using a technique called model parallelism, which allows the model to be split across multiple GPUs. This makes it possible to train models that are too large to fit on a single GPU.\n## What is DeepSpeed?\nDeepSpeed is a machine learning framework that enables the training of large-scale models with reduced computational resources. It achieves this by using a technique called model parallelism, which allows the model to be split across multiple GPUs",
       "throughput": 94.70370546821054
     }
+  },
+  "tests/test_text_generation_example.py::test_text_generation_bnb[unsloth/Meta-Llama-3.1-70B-bnb-4bit-1-20-False-True]": {
+    "gaudi2": {
+      "output": "DeepSpeed is a machine learning framework that makes distributed training easy, efficient, and flexible. DeepSpeed can train BERT-Large on",
+      "throughput": 0.7572952
+    }
+  },
+  "tests/test_text_generation_example.py::test_text_generation_bnb[meta-llama/Llama-3.1-70B-1-20-True-True]": {
+    "gaudi2": {
+      "output": "DeepSpeed is a machine learning framework that makes distributed training easy, efficient, and effective. It is a deep learning optimization library that makes",
+      "throughput": 0.7583387
+    }
   }
 }
\ No newline at end of file
diff --git a/tests/test_bnb_inference.py b/tests/test_bnb_inference.py
deleted file mode 100644
index f4ab6c39f7..0000000000
--- a/tests/test_bnb_inference.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# coding=utf-8
-# Copyright 2022 the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import os
-
-import pytest
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
-from .utils import OH_DEVICE_CONTEXT
-
-
-def get_model(token: str, model_id: str):
-    nf4_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16,
-    )
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id, quantization_config=nf4_config, device_map={"": "hpu"}, torch_dtype=torch.bfloat16, token=token.value
-    )
-
-    return model
-
-
-@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B"])
-@pytest.mark.parametrize("compile_on", [True, False])
-@pytest.mark.skipif("gaudi1" == OH_DEVICE_CONTEXT, reason="execution not supported on gaudi1")
-def test_nf4_quantization_inference(token: str, baseline, model_id: str, compile_on: bool):
-    os.environ["PT_HPU_LAZY_MODE"] = "0"
-    from optimum.habana.transformers import modeling_utils
-
-    modeling_utils.adapt_transformers_to_gaudi()
-
-    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token.value)
-    model = get_model(token, model_id)
-
-    generation_config = copy.deepcopy(model.generation_config)
-    generation_config.max_new_tokens = 20
-    generation_config.use_cache = True
-    generation_config.use_flash_attention = True
-
-    if compile_on:
-        model = torch.compile(model, backend="hpu_backend")
-
-    input_text = "Hello my name is"
-    inputs = tokenizer(input_text, return_tensors="pt").to(device="hpu")
-
-    torch.manual_seed(42)
-    outputs = model.generate(**inputs, generation_config=generation_config, lazy_mode=False)
-    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    baseline.assertEqual(output=decoded_output)
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index 701a5c49d5..c997601953 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -90,6 +90,10 @@
         "load_quantized_model_with_autoawq": [
             ("TheBloke/Llama-2-7b-Chat-AWQ", 1, 10, False, 128, 2048),
         ],
+        "run_model_with_bnb": [
+            ("unsloth/Meta-Llama-3.1-70B-bnb-4bit", 1, 20, False, True),
+            ("meta-llama/Llama-3.1-70B", 1, 20, True, True),
+        ],
         "deepspeed": [
             pytest.param("bigscience/bloomz", 8, 1, marks=pytest.mark.x8),
             # pytest.param("meta-llama/Llama-2-70b-hf", 8, 1, marks=pytest.mark.x8),
@@ -141,6 +145,7 @@
         "fp8": [],
         "load_quantized_model_with_autogptq": [],
         "load_quantized_model_with_autoawq": [],
+        "run_model_with_bnb": [],
         "deepspeed": [
             ("bigscience/bloomz-7b1", 8, 1),
         ],
@@ -166,6 +171,7 @@ def _test_text_generation(
     fp8: bool = False,
     load_quantized_model_with_autogptq: bool = False,
     load_quantized_model_with_autoawq: bool = False,
+    quantize_with_bnb: bool = False,
     max_input_tokens: int = 0,
     max_output_tokens: int = 100,
     parallel_strategy: str = None,
@@ -304,6 +310,8 @@ def _test_text_generation(
         command += ["--load_quantized_model_with_autogptq"]
     if load_quantized_model_with_autoawq:
         command += ["--load_quantized_model_with_autoawq"]
+    if quantize_with_bnb:
+        command += ["--quantize_with_bnb"]
     if parallel_strategy is not None:
         command += [
             f"--parallel_strategy={parallel_strategy}",
@@ -496,6 +504,32 @@ def test_text_generation_awq(
     )
 
 
+@pytest.mark.skipif(condition=bool("gaudi1" == OH_DEVICE_CONTEXT), reason=f"Skipping test for {OH_DEVICE_CONTEXT}")
+@pytest.mark.parametrize(
+    "model_name, world_size, output_len, quantize_with_bnb, check_output",
+    MODELS_TO_TEST["run_model_with_bnb"],  # one 5-tuple per case, in the argnames order above
+)
+def test_text_generation_bnb(
+    model_name: str,  # model id passed through to _test_text_generation
+    world_size: int,
+    output_len: int,  # forwarded as max_output_tokens
+    quantize_with_bnb: bool,  # True -> "--quantize_with_bnb" is appended to the generation command
+    check_output: bool,  # NOTE(review): presumably compares generated text to the baseline - confirm
+    baseline,
+    token,
+):
+    _test_text_generation(
+        model_name,
+        baseline,
+        token,
+        world_size=world_size,
+        torch_compile=True,  # every BnB case runs with torch_compile enabled
+        quantize_with_bnb=quantize_with_bnb,
+        max_output_tokens=output_len,
+        check_output=check_output,
+    )
+
+
 @pytest.mark.parametrize("model_name, world_size, batch_size", MODELS_TO_TEST["deepspeed"])
 def test_text_generation_deepspeed(model_name: str, world_size: int, batch_size: int, baseline, token):
     _test_text_generation(model_name, baseline, token, deepspeed=True, world_size=world_size, batch_size=batch_size)