diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 2ac99dc829..b5e261f32a 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -93,7 +93,7 @@ python3 run_pipeline.py \ ``` ### Inference with FP8 -Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision are enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. INC is used by default for measuring and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`. +Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. More information on enabling FP8 in SynapseAI is available here: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index 9161285881..d3000a7e21 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -37,43 +37,21 @@ def setup_quantization(model, args): - if os.getenv("USE_INC", "1") != "0": - try: - from neural_compressor.torch.quantization import FP8Config, convert, prepare - except ImportError: - raise ImportError( - "Module neural_compressor is missing. 
Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" - ) - - config = FP8Config.from_json_file(args.quant_config) - if config.measure: - model = prepare(model, config) - elif config.quantize: - model = convert(model, config) - else: - import habana_frameworks.torch.core as htcore - import habana_quantization_toolkit + from neural_compressor.torch.quantization import FP8Config, convert, prepare - habana_quantization_toolkit.prep_model(model) - htcore.hpu_initialize(model) + config = FP8Config.from_json_file(args.quant_config) + if config.measure: + model = prepare(model, config) + elif config.quantize: + model = convert(model, config) return model def finalize_quantization(model): - if os.getenv("USE_INC", "1") != "0": - try: - from neural_compressor.torch.quantization import finalize_calibration - except ImportError: - raise ImportError( - "Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" - ) - - finalize_calibration(model) - else: - import habana_quantization_toolkit + from neural_compressor.torch.quantization import finalize_calibration - habana_quantization_toolkit.finish_measurements(model) + finalize_calibration(model) def main(): @@ -151,7 +129,7 @@ def main(): # set args.quant_config with env variable if it is set args.quant_config = os.getenv("QUANT_CONFIG", "") - + os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") adapt_transformers_to_gaudi() model_type = AutoConfig.from_pretrained(args.model_name_or_path).model_type @@ -225,6 +203,7 @@ def main(): if args.quant_config: generator.model = setup_quantization(generator.model, args) + htcore.hpu_initialize(generator.model) # warm up for i in range(args.warmup): diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 6be7b3541e..61066dbaf9 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ 
-187,41 +187,30 @@ def get_torch_compiled_model(model): def setup_quantization(model, args): - if os.getenv("USE_INC", "1") != "0": - try: - from neural_compressor.torch.quantization import FP8Config, convert, prepare - except ImportError: - raise ImportError( - "Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" - ) - - config = FP8Config.from_json_file(args.quant_config) - if config.measure: - model = prepare(model, config) - elif config.quantize: - model = convert(model, config) - else: - import habana_quantization_toolkit + try: + from neural_compressor.torch.quantization import FP8Config, convert, prepare + except ImportError: + raise ImportError( + "Module neural_compressor is missing. Please use a newer Synapse version to use quantization." + ) - habana_quantization_toolkit.prep_model(model) + config = FP8Config.from_json_file(args.quant_config) + if config.measure: + model = prepare(model, config) + if config.quantize: + model = convert(model, config) return model def finalize_quantization(model): - if os.getenv("USE_INC", "1") != "0": - try: - from neural_compressor.torch.quantization import finalize_calibration - except ImportError: - raise ImportError( - "Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" - ) - - finalize_calibration(model) - else: - import habana_quantization_toolkit - - habana_quantization_toolkit.finish_measurements(model) + try: + from neural_compressor.torch.quantization import finalize_calibration + except ImportError: + raise ImportError( + "Module neural_compressor is missing. Please use a newer Synapse version to use quantization." + ) + finalize_calibration(model) def setup_model(args, model_dtype, model_kwargs, logger):