From 0248657bd5877ba744c9a69d2be7cb54692e1f78 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Fri, 13 Jun 2025 08:41:07 +0300 Subject: [PATCH] Integrated NF4 tests to text-generation --- examples/text-generation/run_generation.py | 5 ++ examples/text-generation/utils.py | 16 +++++ .../fixture/tests/test_bnb_inference.json | 8 --- .../tests/test_text_generation_example.json | 12 ++++ tests/test_bnb_inference.py | 67 ------------------- tests/test_text_generation_example.py | 34 ++++++++++ 6 files changed, 67 insertions(+), 75 deletions(-) delete mode 100644 tests/baselines/fixture/tests/test_bnb_inference.json delete mode 100644 tests/test_bnb_inference.py diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index 40f1c3f3d2..1187595be7 100644 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -427,6 +427,11 @@ def __call__(self, parser, namespace, values, option_string=None): action="store_true", help="Load an AutoAWQ quantized checkpoint using AutoAWQ.", ) + quant_parser_group.add_argument( + "--quantize_with_bnb", + action="store_true", + help="Quantize model to NF4 using BnB and then use NF4 weights for text-generation", + ) quant_parser_group.add_argument( "--disk_offload", action="store_true", diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 9654c21bda..b43aa8bae0 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -299,6 +299,22 @@ def setup_model(args, model_dtype, model_kwargs, logger): model = AutoModelForCausalLM.from_pretrained( args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs ) + elif args.quantize_with_bnb: + from transformers import BitsAndBytesConfig + + nf4_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + ) + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + quantization_config=nf4_config, + device_map={"": "hpu"}, + torch_dtype=model_dtype, + **model_kwargs, + ) elif args.load_quantized_model_with_inc: # TODO: This will be removed in v1.20 Synapse release # Override neural_compressor split_rank_state_dict for loading neural_magic models on multi-cards. diff --git a/tests/baselines/fixture/tests/test_bnb_inference.json b/tests/baselines/fixture/tests/test_bnb_inference.json deleted file mode 100644 index cb7a0ef2d0..0000000000 --- a/tests/baselines/fixture/tests/test_bnb_inference.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "tests/test_bnb_inference.py::test_nf4_quantization_inference[True-meta-llama/Llama-3.2-1B]": { - "output": "Hello my name is Marlene and I am 36 years old. I am a very happy and loving person. I" - }, - "tests/test_bnb_inference.py::test_nf4_quantization_inference[False-meta-llama/Llama-3.2-1B]": { - "output": "Hello my name is Marlene and I am 36 years old. I am a very happy and loving person. I" - } -} \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_text_generation_example.json b/tests/baselines/fixture/tests/test_text_generation_example.json index 299bf50686..4d90e72dc6 100644 --- a/tests/baselines/fixture/tests/test_text_generation_example.json +++ b/tests/baselines/fixture/tests/test_text_generation_example.json @@ -672,5 +672,17 @@ "output": "DeepSpeed is a machine learning framework that enables the training of large-scale models with reduced computational resources. It achieves this by using a technique called model parallelism, which allows the model to be split across multiple GPUs. This makes it possible to train models that are too large to fit on a single GPU.\n## What is DeepSpeed?\nDeepSpeed is a machine learning framework that enables the training of large-scale models with reduced computational resources. It achieves this by using a technique called model parallelism, which allows the model to be split across multiple GPUs", "throughput": 94.70370546821054 } + }, + "tests/test_text_generation_example.py::test_text_generation_bnb[unsloth/Meta-Llama-3.1-70B-bnb-4bit-1-20-False-True]": { + "gaudi2": { + "output": "DeepSpeed is a machine learning framework that makes distributed training easy, efficient, and flexible. DeepSpeed can train BERT-Large on", + "throughput": 0.7572952 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bnb[meta-llama/Llama-3.1-70B-1-20-True-True]": { + "gaudi2": { + "output": "DeepSpeed is a machine learning framework that makes distributed training easy, efficient, and effective. It is a deep learning optimization library that makes", + "throughput": 0.7583387 + } } } \ No newline at end of file diff --git a/tests/test_bnb_inference.py b/tests/test_bnb_inference.py deleted file mode 100644 index f4ab6c39f7..0000000000 --- a/tests/test_bnb_inference.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding=utf-8 -# Copyright 2022 the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os - -import pytest -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig - -from .utils import OH_DEVICE_CONTEXT - - -def get_model(token: str, model_id: str): - nf4_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - ) - - model = AutoModelForCausalLM.from_pretrained( - model_id, quantization_config=nf4_config, device_map={"": "hpu"}, torch_dtype=torch.bfloat16, token=token.value - ) - - return model - - -@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B"]) -@pytest.mark.parametrize("compile_on", [True, False]) -@pytest.mark.skipif("gaudi1" == OH_DEVICE_CONTEXT, reason="execution not supported on gaudi1") -def test_nf4_quantization_inference(token: str, baseline, model_id: str, compile_on: bool): - os.environ["PT_HPU_LAZY_MODE"] = "0" - from optimum.habana.transformers import modeling_utils - - modeling_utils.adapt_transformers_to_gaudi() - - tokenizer = AutoTokenizer.from_pretrained(model_id, token=token.value) - model = get_model(token, model_id) - - generation_config = copy.deepcopy(model.generation_config) - generation_config.max_new_tokens = 20 - generation_config.use_cache = True - generation_config.use_flash_attention = True - - if compile_on: - model = torch.compile(model, backend="hpu_backend") - - input_text = "Hello my name is" - inputs = tokenizer(input_text, return_tensors="pt").to(device="hpu") - - torch.manual_seed(42) - outputs = model.generate(**inputs, generation_config=generation_config, lazy_mode=False) - decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True) - - baseline.assertEqual(output=decoded_output) diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index 701a5c49d5..c997601953 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -90,6 +90,10 @@ "load_quantized_model_with_autoawq": [ ("TheBloke/Llama-2-7b-Chat-AWQ", 1, 10, False, 128, 2048), ], + "run_model_with_bnb": [ + ("unsloth/Meta-Llama-3.1-70B-bnb-4bit", 1, 20, False, True), + ("meta-llama/Llama-3.1-70B", 1, 20, True, True), + ], "deepspeed": [ pytest.param("bigscience/bloomz", 8, 1, marks=pytest.mark.x8), # pytest.param("meta-llama/Llama-2-70b-hf", 8, 1, marks=pytest.mark.x8), @@ -141,6 +145,7 @@ "fp8": [], "load_quantized_model_with_autogptq": [], "load_quantized_model_with_autoawq": [], + "run_model_with_bnb": [], "deepspeed": [ ("bigscience/bloomz-7b1", 8, 1), ], @@ -166,6 +171,7 @@ def _test_text_generation( fp8: bool = False, load_quantized_model_with_autogptq: bool = False, load_quantized_model_with_autoawq: bool = False, + quantize_with_bnb: bool = False, max_input_tokens: int = 0, max_output_tokens: int = 100, parallel_strategy: str = None, @@ -304,6 +310,8 @@ def _test_text_generation( command += ["--load_quantized_model_with_autogptq"] if load_quantized_model_with_autoawq: command += ["--load_quantized_model_with_autoawq"] + if quantize_with_bnb: + command += ["--quantize_with_bnb"] if parallel_strategy is not None: command += [ f"--parallel_strategy={parallel_strategy}", @@ -496,6 +504,32 @@ def test_text_generation_awq( ) +@pytest.mark.skipif(condition=bool("gaudi1" == OH_DEVICE_CONTEXT), reason=f"Skipping test for {OH_DEVICE_CONTEXT}") +@pytest.mark.parametrize( + "model_name, world_size, output_len, quantize_with_bnb, check_output", + MODELS_TO_TEST["run_model_with_bnb"], +) +def test_text_generation_bnb( + model_name: str, + world_size: int, + output_len: int, + quantize_with_bnb: bool, + check_output: bool, + baseline, + token, +): + _test_text_generation( + model_name, + baseline, + token, + world_size=world_size, + torch_compile=True, + quantize_with_bnb=quantize_with_bnb, + max_output_tokens=output_len, + check_output=check_output, + ) + + @pytest.mark.parametrize("model_name, world_size, batch_size", MODELS_TO_TEST["deepspeed"]) def test_text_generation_deepspeed(model_name: str, world_size: int, batch_size: int, baseline, token): _test_text_generation(model_name, baseline, token, deepspeed=True, world_size=world_size, batch_size=batch_size)