From e42e92f7914963f9e5b375b68897be720ff2a142 Mon Sep 17 00:00:00 2001
From: yan tomsinsky <ytomsinsky@habana.ai>
Date: Wed, 17 Apr 2024 11:53:29 +0300
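Subject: [PATCH] Remove --fp8 flag from script

Drop the `--fp8` argument from examples/text-generation/run_generation.py,
remove it from the option list and example commands in
examples/text-generation/README.md, and stop passing it in
tests/test_text_generation_example.py.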

---
 examples/text-generation/README.md         | 6 ------
 examples/text-generation/run_generation.py | 1 -
 tests/test_text_generation_example.py      | 1 -
 3 files changed, 8 deletions(-)

diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index 2a5db4c926..0745c6b852 100644
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -107,7 +107,6 @@ Here are a few settings you may be interested in:
 - `--prompt` to benchmark the model on one or several prompts of your choice
 - `--attn_softmax_bf16` to run attention softmax layer in bfloat16 precision provided that the model (such as Llama) supports it
 - `--trim_logits` to calculate logits only for the last token in the first time step provided that the model (such as Llama) supports it
-- `--fp8` Enable Quantization to fp8
 
 For example, you can reproduce the results presented in [this blog post](https://huggingface.co/blog/habana-gaudi-2-bloom) with the following command:
 ```bash
@@ -273,7 +272,6 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
 --reuse_cache \
 --bf16 \
 --batch_size 1 \
---fp8
 ```
 
 Alternatively, here is another example to quantize the model based on previous measurements for LLama2-70b:
@@ -291,7 +289,6 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
 --max_new_tokens 2048 \
 --max_input_tokens 2048 \
 --limit_hpu_graphs \
---fp8
 ```
 
 Here is an example to measure the tensor quantization statistics on Mixtral-8x7B with 1 card:
@@ -318,7 +315,6 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_mixtral.json python run_generati
 --max_new_tokens 2048 \
 --batch_size 16 \
 --bf16 \
---fp8
 ```
 
 Here is an example to measure the tensor quantization statistics on Falcon-180B with 8 cards:
@@ -350,9 +346,7 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
 --bf16 \
 --reuse_cache \
 --trim_logits \
---fp8
 ```
-`--fp8` is required to enable quantization in fp8.
 
 ### Using Habana Flash Attention
 
diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index c9fc7ec868..bf5e3d996a 100644
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -221,7 +221,6 @@ def setup_parser(parser):
         help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)",
     )
 
-    parser.add_argument("--fp8", action="store_true", help="Enable Quantization to fp8")
     parser.add_argument(
         "--use_flash_attention",
         action="store_true",
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index 00602dbd0e..e4e0324822 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -109,7 +109,6 @@ def _test_text_generation(
 
     if fp8:
         command += [
-            "--fp8",
             "--reuse_cache",
             "--trim_logits",
         ]