From 3d66604567e822198b5f3fba3e76d6f2e8e1dc1b Mon Sep 17 00:00:00 2001 From: yan tomsinsky Date: Thu, 15 Aug 2024 11:01:28 +0300 Subject: [PATCH 1/5] Remove HQT from OHF --- examples/image-to-text/README.md | 8 ++++++ examples/text-generation/utils.py | 47 ++++++++++++------------------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 0f1a2624d4..7337096f19 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -75,6 +75,14 @@ python3 run_pipeline.py \ ### Inference with FP8 Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch. +Models that have been validated: + - [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning) + - [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) + - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base) + +### Running with FP8 + +Llava-1.5-7b and Llava-1.5-13b in FP8 are enabled using the Intel Neural Compressor (INC), which provides model measurement and quantization capabilities in PyTorch. More information on enabling FP8 in SynapseAI is available here: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 056f18f1a8..3e0b4608e0 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -187,41 +187,30 @@ def get_torch_compiled_model(model): def setup_quantization(model, args): - if os.getenv("USE_INC", "1") != "0": - try: - from neural_compressor.torch.quantization import FP8Config, convert, prepare - except ImportError: - raise ImportError( - "Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" - ) - - config = FP8Config.from_json_file(args.quant_config) - if config.measure: - model = prepare(model, config) - elif config.quantize: - model = convert(model, config) - else: - import habana_quantization_toolkit + try: + from neural_compressor.torch.quantization import FP8Config, convert, prepare + except ImportError: + raise ImportError( + "Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" + ) - habana_quantization_toolkit.prep_model(model) + config = FP8Config.from_json_file(args.quant_config) + if config.measure: + model = prepare(model, config) + if config.quantize or args.assistant_model is not None: + model = convert(model, config) return model def finalize_quantization(model): - if os.getenv("USE_INC", "1") != "0": - try: - from neural_compressor.torch.quantization import finalize_calibration - except ImportError: - raise ImportError( - "Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" - ) - - finalize_calibration(model) - else: - import habana_quantization_toolkit - - habana_quantization_toolkit.finish_measurements(model) + try: + from neural_compressor.torch.quantization import finalize_calibration + except ImportError: + raise ImportError( + "Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" + ) + finalize_calibration(model) def setup_model(args, model_dtype, model_kwargs, logger): From 476a08417a7d725d2ac398b7640a4720fb249a5b Mon Sep 17 00:00:00 2001 From: yan tomsinsky Date: Thu, 15 Aug 2024 11:22:56 +0300 Subject: [PATCH 2/5] merge fixes --- examples/image-to-text/README.md | 2 +- examples/text-generation/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 7337096f19..45aec5c632 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -74,7 +74,7 @@ python3 run_pipeline.py \ ### Inference with FP8 -Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch. +Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision are enabled using the Intel Neural Compressor (INC), which provides model measurement and quantization capabilities in PyTorch. Models that have been validated: - [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning) - [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 3e0b4608e0..9f7219e49b 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -191,7 +191,7 @@ def setup_quantization(model, args): from neural_compressor.torch.quantization import FP8Config, convert, prepare except ImportError: raise ImportError( - "Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" + "Module neural_compressor is missing. Please use a newer Synapse version to use quantization." ) config = FP8Config.from_json_file(args.quant_config) @@ -208,7 +208,7 @@ def finalize_quantization(model): from neural_compressor.torch.quantization import finalize_calibration except ImportError: raise ImportError( - "Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" + "Module neural_compressor is missing. Please use a newer Synapse version to use quantization." ) finalize_calibration(model) From 2d0047cefe046cb96722ba428690b0e971a4d51f Mon Sep 17 00:00:00 2001 From: yan tomsinsky Date: Mon, 9 Sep 2024 11:20:39 +0300 Subject: [PATCH 3/5] pr fixes --- examples/image-to-text/README.md | 8 -------- examples/text-generation/utils.py | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 45aec5c632..6ae9c87688 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -75,14 +75,6 @@ python3 run_pipeline.py \ ### Inference with FP8 Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision are enabled using the Intel Neural Compressor (INC), which provides model measurement and quantization capabilities in PyTorch. -Models that have been validated: - - [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning) - - [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) - - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base) - -### Running with FP8 - -Llava-1.5-7b and Llava-1.5-13b in FP8 are enabled using the Intel Neural Compressor (INC), which provides model measurement and quantization capabilities in PyTorch. More information on enabling FP8 in SynapseAI is available here: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 9f7219e49b..e2db1eb889 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -197,7 +197,7 @@ def setup_quantization(model, args): config = FP8Config.from_json_file(args.quant_config) if config.measure: model = prepare(model, config) - if config.quantize or args.assistant_model is not None: + if config.quantize: model = convert(model, config) return model From d0b795f63eb9cf671f924bd7c9e8a7ff55398ecd Mon Sep 17 00:00:00 2001 From: Adam Stachowicz Date: Wed, 25 Sep 2024 16:31:08 +0300 Subject: [PATCH 4/5] Adding image-to-text changes --- examples/image-to-text/run_pipeline.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index 239d6fa4e4..fde8bf9f12 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -36,6 +36,23 @@ logger = logging.getLogger(__name__) +def setup_quantization(model, args): + from neural_compressor.torch.quantization import FP8Config, convert, prepare + + config = FP8Config.from_json_file(args.quant_config) + if config.measure: + model = prepare(model, config) + elif config.quantize: + model = convert(model, config) + return model + + +def finalize_quantization(model): + from neural_compressor.torch.quantization import finalize_calibration + + finalize_calibration(model) + + def main(): parser = argparse.ArgumentParser() @@ -101,7 +118,7 @@ def main(): # set args.quant_config with env variable if it is set args.quant_config = os.getenv("QUANT_CONFIG", "") - + os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") adapt_transformers_to_gaudi() model_type = AutoConfig.from_pretrained(args.model_name_or_path).model_type @@ -163,10 +180,7 @@ def main(): generator.model = wrap_in_hpu_graph(generator.model) if args.quant_config: - import habana_quantization_toolkit - - habana_quantization_toolkit.prep_model(generator.model) - + generator.model = setup_quantization(generator.model, args) htcore.hpu_initialize(generator.model) # warm up @@ -174,7 +188,7 @@ def main(): generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs) torch.hpu.synchronize() if args.quant_config: - habana_quantization_toolkit.finish_measurements(generator.model) + finalize_quantization(generator.model) start = time.perf_counter() for i in range(args.n_iterations): From 08c0237e3bdd743797cf8f8087cdfda675b0542e Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Mon, 7 Oct 2024 16:41:20 +0000 Subject: [PATCH 5/5] Make style --- examples/image-to-text/run_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index 3d443a3f44..d3000a7e21 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -50,6 +50,7 @@ def setup_quantization(model, args): def finalize_quantization(model): from neural_compressor.torch.quantization import finalize_calibration + finalize_calibration(model)