Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions examples/text-generation/run_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,11 @@ def __call__(self, parser, namespace, values, option_string=None):
action="store_true",
help="Load an AutoAWQ quantized checkpoint using AutoAWQ.",
)
quant_parser_group.add_argument(
"--quantize_with_bnb",
action="store_true",
help="Quantize model to NF4 using BnB and then use NF4 weights for text-generation",
)
quant_parser_group.add_argument(
"--disk_offload",
action="store_true",
Expand Down
16 changes: 16 additions & 0 deletions examples/text-generation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,22 @@ def setup_model(args, model_dtype, model_kwargs, logger):
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs
)
elif args.quantize_with_bnb:
from transformers import BitsAndBytesConfig

nf4_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
quantization_config=nf4_config,
device_map={"": "hpu"},
torch_dtype=model_dtype,
**model_kwargs,
)
elif args.load_quantized_model_with_inc:
# TODO: This will be removed in v1.20 Synapse release
# Override neural_compressor split_rank_state_dict for loading neural_magic models on multi-cards.
Expand Down
8 changes: 0 additions & 8 deletions tests/baselines/fixture/tests/test_bnb_inference.json

This file was deleted.

12 changes: 12 additions & 0 deletions tests/baselines/fixture/tests/test_text_generation_example.json
Original file line number Diff line number Diff line change
Expand Up @@ -672,5 +672,17 @@
"output": "DeepSpeed is a machine learning framework that enables the training of large-scale models with reduced computational resources. It achieves this by using a technique called model parallelism, which allows the model to be split across multiple GPUs. This makes it possible to train models that are too large to fit on a single GPU.\n## What is DeepSpeed?\nDeepSpeed is a machine learning framework that enables the training of large-scale models with reduced computational resources. It achieves this by using a technique called model parallelism, which allows the model to be split across multiple GPUs",
"throughput": 94.70370546821054
}
},
"tests/test_text_generation_example.py::test_text_generation_bnb[unsloth/Meta-Llama-3.1-70B-bnb-4bit-1-20-False-True]": {
"gaudi2": {
"output": "DeepSpeed is a machine learning framework that makes distributed training easy, efficient, and flexible. DeepSpeed can train BERT-Large on",
"throughput": 0.7572952
}
},
"tests/test_text_generation_example.py::test_text_generation_bnb[meta-llama/Llama-3.1-70B-1-20-True-True]": {
"gaudi2": {
"output": "DeepSpeed is a machine learning framework that makes distributed training easy, efficient, and effective. It is a deep learning optimization library that makes",
"throughput": 0.7583387
}
}
}
67 changes: 0 additions & 67 deletions tests/test_bnb_inference.py

This file was deleted.

34 changes: 34 additions & 0 deletions tests/test_text_generation_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@
"load_quantized_model_with_autoawq": [
("TheBloke/Llama-2-7b-Chat-AWQ", 1, 10, False, 128, 2048),
],
"run_model_with_bnb": [
("unsloth/Meta-Llama-3.1-70B-bnb-4bit", 1, 20, False, True),
("meta-llama/Llama-3.1-70B", 1, 20, True, True),
],
"deepspeed": [
pytest.param("bigscience/bloomz", 8, 1, marks=pytest.mark.x8),
# pytest.param("meta-llama/Llama-2-70b-hf", 8, 1, marks=pytest.mark.x8),
Expand Down Expand Up @@ -141,6 +145,7 @@
"fp8": [],
"load_quantized_model_with_autogptq": [],
"load_quantized_model_with_autoawq": [],
"run_model_with_bnb": [],
"deepspeed": [
("bigscience/bloomz-7b1", 8, 1),
],
Expand All @@ -166,6 +171,7 @@ def _test_text_generation(
fp8: bool = False,
load_quantized_model_with_autogptq: bool = False,
load_quantized_model_with_autoawq: bool = False,
quantize_with_bnb: bool = False,
max_input_tokens: int = 0,
max_output_tokens: int = 100,
parallel_strategy: str = None,
Expand Down Expand Up @@ -304,6 +310,8 @@ def _test_text_generation(
command += ["--load_quantized_model_with_autogptq"]
if load_quantized_model_with_autoawq:
command += ["--load_quantized_model_with_autoawq"]
if quantize_with_bnb:
command += ["--quantize_with_bnb"]
if parallel_strategy is not None:
command += [
f"--parallel_strategy={parallel_strategy}",
Expand Down Expand Up @@ -496,6 +504,32 @@ def test_text_generation_awq(
)


@pytest.mark.skipif(condition=bool("gaudi1" == OH_DEVICE_CONTEXT), reason=f"Skipping test for {OH_DEVICE_CONTEXT}")
Comment thread
vivekgoe marked this conversation as resolved.
@pytest.mark.parametrize(
    "model_name, world_size, output_len, quantize_with_bnb, check_output",
    MODELS_TO_TEST["run_model_with_bnb"],
)
def test_text_generation_bnb(
    model_name: str,
    world_size: int,
    output_len: int,
    quantize_with_bnb: bool,
    check_output: bool,
    baseline,
    token,
):
    """Run the text-generation example for the bitsandbytes test cases.

    Each case comes from ``MODELS_TO_TEST["run_model_with_bnb"]``. The call is
    always made with ``torch_compile=True``; ``output_len`` is forwarded as
    ``max_output_tokens`` and ``quantize_with_bnb`` is forwarded unchanged to
    ``_test_text_generation``.
    """
    # Gather the keyword options first so the positional arguments
    # (model, baseline, token) stand out in the actual call.
    run_options = {
        "world_size": world_size,
        "torch_compile": True,
        "quantize_with_bnb": quantize_with_bnb,
        "max_output_tokens": output_len,
        "check_output": check_output,
    }
    _test_text_generation(model_name, baseline, token, **run_options)


@pytest.mark.parametrize("model_name, world_size, batch_size", MODELS_TO_TEST["deepspeed"])
def test_text_generation_deepspeed(model_name: str, world_size: int, batch_size: int, baseline, token):
    """Run the text-generation example with ``deepspeed=True``.

    Cases come from ``MODELS_TO_TEST["deepspeed"]``; ``world_size`` and
    ``batch_size`` are forwarded unchanged to ``_test_text_generation``.
    """
    launch_options = {
        "deepspeed": True,
        "world_size": world_size,
        "batch_size": batch_size,
    }
    _test_text_generation(model_name, baseline, token, **launch_options)
Expand Down