From 852f7bdaaad10034a24b68088445c430b6a69914 Mon Sep 17 00:00:00 2001
From: Harish Subramony <hsubramony@habana.ai>
Date: Wed, 4 Dec 2024 15:06:52 -0800
Subject: [PATCH 1/4] add sdp_on_bf16 for tests, text_gen

---
 examples/text-generation/run_generation.py | 6 ++++++
 tests/test_examples.py                     | 9 +++++++++
 tests/test_fsdp_examples.py                | 2 ++
 tests/test_text_generation_example.py      | 4 ++++
 4 files changed, 21 insertions(+)

diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index 4b2ab96842..a057261b32 100755
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -320,6 +320,9 @@ def setup_parser(parser):
         action="store_true",
         help="Run the inference with dataset for specified --n_iterations(default:5)",
     )
+    parser.add_argument(
+        "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend"
+    )
 
     quant_parser_group = parser.add_mutually_exclusive_group()
     quant_parser_group.add_argument(
@@ -389,6 +392,9 @@ def main():
 
     import habana_frameworks.torch.hpu as torch_hpu
 
+    if args.sdp_on_bf16:
+        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
+
     if args.dataset_name is None:
         # Benchmark over the prompts below
         if args.prompt:
diff --git a/tests/test_examples.py b/tests/test_examples.py
index b6f07b0512..e9d8e57852 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -520,6 +520,15 @@ def test(self):
                 env_variables["PT_HPU_LAZY_MODE"] = "0"
                 env_variables["PT_ENABLE_INT64_SUPPORT"] = "1"
 
+            if self.EXAMPLE_NAME == "run_glue":
+                if model_name == "bert-large-uncased-whole-word-masking":
+                    extra_command_line_arguments.append("--sdp_on_bf16")
+
+            if self.EXAMPLE_NAME == "run_qa":
+                if model_name == "bert-large-uncased-whole-word-masking" or \
+                   model_name == "albert-large-v2":
+                    extra_command_line_arguments.append("--sdp_on_bf16")
+
             with TemporaryDirectory() as tmp_dir:
                 cmd_line = self._create_command_line(
                     multi_card,
diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py
index 6335f28ebf..a0197834ef 100644
--- a/tests/test_fsdp_examples.py
+++ b/tests/test_fsdp_examples.py
@@ -97,6 +97,7 @@ def _test_fsdp(
             f"--gaudi_config_name {gaudi_config}",
             "--throughput_warmup_steps 100",
             "--do_eval",
+            "--sdp_on_bf16",
         ]
     else:
         command += [
@@ -126,6 +127,7 @@ def _test_fsdp(
             "--use_flash_attention True",
             "--flash_attention_causal_mask True",
             f"--token {token.value}",
+            "--sdp_on_bf16",
         ]
 
     with TemporaryDirectory() as tmp_dir:
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index 1fcadba9b0..0fddd2a746 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -221,6 +221,10 @@ def _test_text_generation(
 
     if "gemma" in model_name.lower():
         command += ["--use_flash_attention"]
+        command += ["--sdp_on_bf16"]
+
+    if "decilm" in model_name.lower():
+        command += ["--sdp_on_bf16"]
 
     if (reuse_cache or torch_compile) and not parallel_strategy == "tp" and not is_starcoder_first_gen_model:
         command += ["--reuse_cache"]

From 957195902aeceafc678c5071fdf5860f5bb6419c Mon Sep 17 00:00:00 2001
From: Harish Subramony <hsubramony@habana.ai>
Date: Thu, 5 Dec 2024 20:00:18 -0800
Subject: [PATCH 2/4] add sdp_on_bf16 to run_pipeline, more tests

---
 examples/image-to-text/run_pipeline.py |  6 ++++++
 tests/test_encoder_decoder.py          |  3 +++
 tests/test_examples.py                 | 11 +++++++++++
 tests/test_image_to_text_example.py    |  5 +++++
 tests/test_text_generation_example.py  |  3 +++
 5 files changed, 28 insertions(+)

diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
index ae76fdbe55..6ab1314eb9 100644
--- a/examples/image-to-text/run_pipeline.py
+++ b/examples/image-to-text/run_pipeline.py
@@ -174,9 +174,15 @@ def main():
         action="store_true",
         help="Whether to use the key/value cache for decoding. It should speed up generation.",
     )
+    parser.add_argument(
+        "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend"
+    )
 
     args = parser.parse_args()
 
+    if args.sdp_on_bf16:
+        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
+
     # set args.quant_config with env variable if it is set
     args.quant_config = os.getenv("QUANT_CONFIG", "")
 
diff --git a/tests/test_encoder_decoder.py b/tests/test_encoder_decoder.py
index 27dd1b75c2..20d808b69f 100644
--- a/tests/test_encoder_decoder.py
+++ b/tests/test_encoder_decoder.py
@@ -206,6 +206,9 @@ def _test_text_translation(
         if "opus-mt-zh-en" in model_name:
             command_args.append("--max_source_length 512")
 
+        if "Babelscape/mrebel-large" in model_name or "nllb-200-distilled-600M" in model_name:
+            command_args.append("--sdp_on_bf16")
+
         command = self._build_command(
             task=task,
             deepspeed=deepspeed,
diff --git a/tests/test_examples.py b/tests/test_examples.py
index e9d8e57852..37d1fb8d8a 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -529,6 +529,17 @@ def test(self):
                    model_name == "albert-large-v2":
                     extra_command_line_arguments.append("--sdp_on_bf16")
 
+            if self.EXAMPLE_NAME == "run_bridgetower":
+                if model_name == "BridgeTower/bridgetower-large-itm-mlm-itc":
+                    extra_command_line_arguments.append("--sdp_on_bf16")
+
+            if self.EXAMPLE_NAME == "run_speech_recognition_seq2seq":
+                if model_name == "openai/whisper-small":
+                    extra_command_line_arguments.append("--sdp_on_bf16")
+
+            if self.EXAMPLE_NAME == "run_clip":
+                extra_command_line_arguments.append("--sdp_on_bf16")
+
             with TemporaryDirectory() as tmp_dir:
                 cmd_line = self._create_command_line(
                     multi_card,
diff --git a/tests/test_image_to_text_example.py b/tests/test_image_to_text_example.py
index e8aa562907..8e78707219 100644
--- a/tests/test_image_to_text_example.py
+++ b/tests/test_image_to_text_example.py
@@ -67,6 +67,11 @@ def _test_image_to_text(
         "--use_hpu_graphs",
     ]
 
+    if "meta-llama/Llama-3.2-11B-Vision-Instruct" in model_name or "tiiuae/falcon-11B-vlm" in model_name:
+        command += [
+            "--sdp_on_bf16",
+        ]
+
     command.append("--bf16")
 
     with TemporaryDirectory() as tmp_dir:
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index 0fddd2a746..df6c604b69 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -226,6 +226,9 @@ def _test_text_generation(
     if "decilm" in model_name.lower():
         command += ["--sdp_on_bf16"]
 
+    if "mamba-130m-hf" in model_name.lower():
+        command += ["--sdp_on_bf16"]
+
     if (reuse_cache or torch_compile) and not parallel_strategy == "tp" and not is_starcoder_first_gen_model:
         command += ["--reuse_cache"]
 

From c59935d74599d56439a01cd2000aeff43ec4b835 Mon Sep 17 00:00:00 2001
From: Harish Subramony <hsubramony@habana.ai>
Date: Fri, 6 Dec 2024 10:13:29 -0800
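Subject: [PATCH 3/4] code cleanup: append --sdp_on_bf16 once in _test_fsdp

Replace the two per-branch "--sdp_on_bf16" list entries in
tests/test_fsdp_examples.py with a single command.append("--sdp_on_bf16")
placed after the if/else, so both branches still receive the flag.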

---
 tests/test_fsdp_examples.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py
index a0197834ef..3e04d95f14 100644
--- a/tests/test_fsdp_examples.py
+++ b/tests/test_fsdp_examples.py
@@ -97,7 +97,6 @@ def _test_fsdp(
             f"--gaudi_config_name {gaudi_config}",
             "--throughput_warmup_steps 100",
             "--do_eval",
-            "--sdp_on_bf16",
         ]
     else:
         command += [
@@ -127,9 +126,10 @@ def _test_fsdp(
             "--use_flash_attention True",
             "--flash_attention_causal_mask True",
             f"--token {token.value}",
-            "--sdp_on_bf16",
         ]
 
+    command.append("--sdp_on_bf16")
+
     with TemporaryDirectory() as tmp_dir:
         command.append(f"--output_dir {tmp_dir}")
         print(f"\n\nCommand to test: {' '.join(command)}\n")

From ae719ad57fc44dafae69fc0cb20a257f159f8a40 Mon Sep 17 00:00:00 2001
From: Harish Subramony <hsubramony@habana.ai>
Date: Fri, 6 Dec 2024 11:54:45 -0800
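Subject: [PATCH 4/4] review cleanup: drop run_pipeline flag, narrow FSDP arg

Remove the --sdp_on_bf16 argument and the
torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) call from
examples/image-to-text/run_pipeline.py. In tests/test_fsdp_examples.py,
pass --sdp_on_bf16 only in the command list that ends with --do_eval
instead of appending it unconditionally after the if/else.

---
NOTE(review): tests/test_image_to_text_example.py (patch 2/4) still adds
--sdp_on_bf16 for the Llama-3.2-11B-Vision-Instruct and falcon-11B-vlm
models. If that test invokes examples/image-to-text/run_pipeline.py, the
script must accept the flag on the target branch or argparse will reject
it as an unrecognized argument -- please confirm.
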
 examples/image-to-text/run_pipeline.py | 6 ------
 tests/test_fsdp_examples.py            | 3 +--
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
index 6ab1314eb9..ae76fdbe55 100644
--- a/examples/image-to-text/run_pipeline.py
+++ b/examples/image-to-text/run_pipeline.py
@@ -174,15 +174,9 @@ def main():
         action="store_true",
         help="Whether to use the key/value cache for decoding. It should speed up generation.",
     )
-    parser.add_argument(
-        "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend"
-    )
 
     args = parser.parse_args()
 
-    if args.sdp_on_bf16:
-        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
-
     # set args.quant_config with env variable if it is set
     args.quant_config = os.getenv("QUANT_CONFIG", "")
 
diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py
index 3e04d95f14..180a2bb3f9 100644
--- a/tests/test_fsdp_examples.py
+++ b/tests/test_fsdp_examples.py
@@ -97,6 +97,7 @@ def _test_fsdp(
             f"--gaudi_config_name {gaudi_config}",
             "--throughput_warmup_steps 100",
             "--do_eval",
+            "--sdp_on_bf16",
         ]
     else:
         command += [
@@ -128,8 +129,6 @@ def _test_fsdp(
             f"--token {token.value}",
         ]
 
-    command.append("--sdp_on_bf16")
-
     with TemporaryDirectory() as tmp_dir:
         command.append(f"--output_dir {tmp_dir}")
         print(f"\n\nCommand to test: {' '.join(command)}\n")