Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/image-to-text/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ python3 run_pipeline.py \
```

### Inference with FP8
Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. INC is used by default for measurement and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`.
Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch.

Comment thread
Yantom1 marked this conversation as resolved.
More information on enabling FP8 in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html
Expand Down
41 changes: 10 additions & 31 deletions examples/image-to-text/run_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,43 +37,21 @@


def setup_quantization(model, args):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import FP8Config, convert, prepare
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

config = FP8Config.from_json_file(args.quant_config)
if config.measure:
model = prepare(model, config)
elif config.quantize:
model = convert(model, config)
else:
import habana_frameworks.torch.core as htcore
import habana_quantization_toolkit
from neural_compressor.torch.quantization import FP8Config, convert, prepare

habana_quantization_toolkit.prep_model(model)
htcore.hpu_initialize(model)
config = FP8Config.from_json_file(args.quant_config)
if config.measure:
model = prepare(model, config)
elif config.quantize:
model = convert(model, config)

return model


def finalize_quantization(model):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import finalize_calibration
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

finalize_calibration(model)
else:
import habana_quantization_toolkit
from neural_compressor.torch.quantization import finalize_calibration

habana_quantization_toolkit.finish_measurements(model)
finalize_calibration(model)


def main():
Expand Down Expand Up @@ -151,7 +129,7 @@ def main():

# set args.quant_config with env variable if it is set
args.quant_config = os.getenv("QUANT_CONFIG", "")

os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")
adapt_transformers_to_gaudi()

model_type = AutoConfig.from_pretrained(args.model_name_or_path).model_type
Expand Down Expand Up @@ -225,6 +203,7 @@ def main():

if args.quant_config:
generator.model = setup_quantization(generator.model, args)
htcore.hpu_initialize(generator.model)

# warm up
for i in range(args.warmup):
Expand Down
47 changes: 18 additions & 29 deletions examples/text-generation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,41 +187,30 @@ def get_torch_compiled_model(model):


def setup_quantization(model, args):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import FP8Config, convert, prepare
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

config = FP8Config.from_json_file(args.quant_config)
if config.measure:
model = prepare(model, config)
elif config.quantize:
model = convert(model, config)
else:
import habana_quantization_toolkit
try:
from neural_compressor.torch.quantization import FP8Config, convert, prepare
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization."
)

habana_quantization_toolkit.prep_model(model)
config = FP8Config.from_json_file(args.quant_config)
if config.measure:
model = prepare(model, config)
if config.quantize:
model = convert(model, config)

return model


def finalize_quantization(model):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import finalize_calibration
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

finalize_calibration(model)
else:
import habana_quantization_toolkit

habana_quantization_toolkit.finish_measurements(model)
try:
from neural_compressor.torch.quantization import finalize_calibration
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization."
)
finalize_calibration(model)


def setup_model(args, model_dtype, model_kwargs, logger):
Expand Down