From e42e92f7914963f9e5b375b68897be720ff2a142 Mon Sep 17 00:00:00 2001 From: yan tomsinsky Date: Wed, 17 Apr 2024 11:53:29 +0300 Subject: [PATCH] Remove --fp8 flag from script --- examples/text-generation/README.md | 6 ------ examples/text-generation/run_generation.py | 1 - tests/test_text_generation_example.py | 1 - 3 files changed, 8 deletions(-) diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index 2a5db4c926..0745c6b852 100644 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -107,7 +107,6 @@ Here are a few settings you may be interested in: - `--prompt` to benchmark the model on one or several prompts of your choice - `--attn_softmax_bf16` to run attention softmax layer in bfloat16 precision provided that the model (such as Llama) supports it - `--trim_logits` to calculate logits only for the last token in the first time step provided that the model (such as Llama) supports it -- `--fp8` Enable Quantization to fp8 For example, you can reproduce the results presented in [this blog post](https://huggingface.co/blog/habana-gaudi-2-bloom) with the following command: ```bash @@ -273,7 +272,6 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ --reuse_cache \ --bf16 \ --batch_size 1 \ ---fp8 ``` Alternatively, here is another example to quantize the model based on previous measurements for LLama2-70b: @@ -291,7 +289,6 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ --max_new_tokens 2048 \ --max_input_tokens 2048 \ --limit_hpu_graphs \ ---fp8 ``` Here is an example to measure the tensor quantization statistics on Mixtral-8x7B with 1 card: @@ -318,7 +315,6 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_mixtral.json python run_generati --max_new_tokens 2048 \ --batch_size 16 \ --bf16 \ ---fp8 ``` Here is an example to measure the tensor quantization statistics on Falcon-180B with 8 cards: @@ -350,9 +346,7 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ --bf16 \ --reuse_cache \ --trim_logits \ ---fp8 ``` -`--fp8` is required to enable quantization in fp8. ### Using Habana Flash Attention diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index c9fc7ec868..bf5e3d996a 100644 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -221,7 +221,6 @@ def setup_parser(parser): help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", ) - parser.add_argument("--fp8", action="store_true", help="Enable Quantization to fp8") parser.add_argument( "--use_flash_attention", action="store_true", diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index 00602dbd0e..e4e0324822 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -109,7 +109,6 @@ def _test_text_generation( if fp8: command += [ - "--fp8", "--reuse_cache", "--trim_logits", ]