From d9d4fc079cc065c3d0f42efe2c5eca0e6bc4ea3b Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Fri, 29 Mar 2024 23:08:38 +0000
Subject: [PATCH 1/7] Release: v1.11.0

---
 optimum/habana/version.py | 2 +-
 setup.py                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/optimum/habana/version.py b/optimum/habana/version.py
index 497fd1da79..714a1d7075 100644
--- a/optimum/habana/version.py
+++ b/optimum/habana/version.py
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.11.0.dev0"
+__version__ = "1.11.0"
diff --git a/setup.py b/setup.py
index 5904ba2ea1..c822c41e7f 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@
 
 QUALITY_REQUIRES = [
     "ruff",
-    "hf_doc_builder @ git+https://github.com/huggingface/doc-builder.git",
+    "hf_doc_builder",
 ]
 
 EXTRAS_REQUIRE = {

From 4160e9c8a068a515e3981b37690cebe1551ffe8c Mon Sep 17 00:00:00 2001
From: Libin Tang <litang@habana.ai>
Date: Sat, 30 Mar 2024 02:30:23 -0700
Subject: [PATCH 2/7] Fix fp8 ci (#852)

Co-authored-by: regisss <15324346+regisss@users.noreply.github.com>
---
 tests/test_text_generation_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index 8f3da77526..48150d635c 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -136,7 +136,7 @@ def _test_text_generation(
             env_variables["QUANT_CONFIG"] = os.path.join(
                 path_to_example_dir, "text-generation/quantization_config/maxabs_quant.json"
             )
-            command.insert(-1, "--fp8")
+            command.insert(-2, "--fp8")
 
         proc = subprocess.run(command, env=env_variables)
 

From b0eefc55da46276b75fdf17b75f67d33ff92f0f3 Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Sat, 30 Mar 2024 10:41:13 +0100
Subject: [PATCH 3/7] Fix PR #848 (#853)

---
 tests/test_examples.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_examples.py b/tests/test_examples.py
index 5cf2559f5f..6eefb5c571 100755
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -276,7 +276,7 @@ def test(self):
                 self.assertEqual(return_code, 0)
                 return
             elif self.EXAMPLE_NAME == "run_clip":
-                if not os.environ.get("DATA_CACHE", "0"):
+                if os.environ.get("DATA_CACHE", None) is None:
                     from .clip_coco_utils import COCO_URLS, download_files
 
                     download_files(COCO_URLS)
@@ -327,8 +327,8 @@ def test(self):
 
             extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", [])
 
-            if os.environ.get("DATA_CACHE", "0") and self.EXAMPLE_NAME == "run_clip":
-                extra_command_line_arguments[0] = "--data_dir {}".format(os.environ.get("DATA_CACHE", "$PWD"))
+            if os.environ.get("DATA_CACHE", None) is not None and self.EXAMPLE_NAME == "run_clip":
+                extra_command_line_arguments[0] = "--data_dir {}".format(os.environ["DATA_CACHE"])
 
             with TemporaryDirectory() as tmp_dir:
                 cmd_line = self._create_command_line(
@@ -410,7 +410,7 @@ def _create_command_line(
         task: Optional[str] = None,
         extra_command_line_arguments: Optional[List[str]] = None,
     ) -> List[str]:
-        dataset_name = self.DATASET_NAME if self.DATASET_NAME else task
+        dataset_name = self.DATASET_NAME if self.DATASET_NAME is not None else task
         task_option = f"--{self.DATASET_PARAMETER_NAME} {dataset_name}" if task else " "
 
         cmd_line = ["python3"]
@@ -583,7 +583,7 @@ class MultiCardSpeechRecognitionExampleTester(
     ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_speech_recognition_ctc", multi_card=True
 ):
     TASK_NAME = "regisss/librispeech_asr_for_optimum_habana_ci"
-    DATASET_NAME = os.environ.get("DATA_CACHE", 0)
+    DATASET_NAME = os.environ.get("DATA_CACHE", None)
 
 
 class MultiCardSummarizationExampleTester(

From 8ee87de9085d13593c1b20ada9b1b99a0715da69 Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Sat, 30 Mar 2024 11:00:50 +0100
Subject: [PATCH 4/7] Disable safe loading tests in CI (#854)

---
 tests/test_trainer.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 1963f805c7..2cb3147523 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -49,6 +49,7 @@
     get_gpu_count,
     get_tests_dir,
     is_staging_test,
+    parse_flag_from_env,
     require_optuna,
     require_safetensors,
     require_sentencepiece,
@@ -90,6 +91,20 @@
 PATH_SAMPLE_TEXT = f"{get_tests_dir()}/fixtures/sample_text.txt"
 
 
+_run_safe_loading_tests_ = parse_flag_from_env("SAFE_LOADING_TESTS", default=False)
+
+
+def safe_loading_test(test_case):
+    """
+    Decorator marking a test as a safe loading test (a test gated behind the SAFE_LOADING_TESTS flag).
+    It wraps the test in `unittest.skipUnless`, keyed on the value of that flag parsed from the environment at import time.
+
+    Such tests are skipped by default. Set the SAFE_LOADING_TESTS environment variable to a truthy value to run them.
+
+    """
+    return unittest.skipUnless(_run_safe_loading_tests_, "test requires SAFE_LOADING_TESTS")(test_case)
+
+
 class RegressionDataset:
     def __init__(self, a=2, b=3, length=64, seed=42, label_names=None):
         np.random.seed(seed)
@@ -1465,6 +1480,7 @@ def test_training_with_resume_from_checkpoint_false(self):
 
         trainer.train(resume_from_checkpoint=False)
 
+    @safe_loading_test
     @require_safetensors
     def test_resume_training_with_safe_checkpoint(self):
         # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
@@ -1658,6 +1674,7 @@ def test_load_best_model_at_end(self):
             self.check_saved_checkpoints(tmpdir, 5, total, is_pretrained=False)
             self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss", is_pretrained=False)
 
+    @safe_loading_test
     @require_safetensors
     def test_load_best_model_from_safetensors(self):
         total = int(self.n_epochs * 64 / self.batch_size)

From 221ac07a811fe1326f7cdb9f168fb12e957e3eac Mon Sep 17 00:00:00 2001
From: Libin Tang <litang@habana.ai>
Date: Sat, 30 Mar 2024 19:10:15 +0000
Subject: [PATCH 5/7] Add warmup_step for eval throughput calculation.

---
 optimum/habana/transformers/trainer.py | 79 ++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py
index 4f22bdde37..e44884754b 100644
--- a/optimum/habana/transformers/trainer.py
+++ b/optimum/habana/transformers/trainer.py
@@ -1621,6 +1621,79 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None):
         # Good practice: save your training arguments together with the trained model
         torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
 
+    def evaluate(
+        self,
+        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
+        ignore_keys: Optional[List[str]] = None,
+        metric_key_prefix: str = "eval",
+    ) -> Dict[str, float]:
+        """
+        From https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/trainer.py#L3162 with the following modification
+        1. comment out TPU related
+        2. use throughput_warmup_steps in evaulation throughput calculation
+        """
+        # handle multipe eval datasets
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+        if isinstance(eval_dataset, dict):
+            metrics = {}
+            for eval_dataset_name, _eval_dataset in eval_dataset.items():
+                dataset_metrics = self.evaluate(
+                    eval_dataset=_eval_dataset,
+                    ignore_keys=ignore_keys,
+                    metric_key_prefix=f"{metric_key_prefix}_{eval_dataset_name}",
+                )
+                metrics.update(dataset_metrics)
+            return metrics
+
+        # memory metrics - must set up as early as possible
+        self._memory_tracker.start()
+
+        eval_dataloader = self.get_eval_dataloader(eval_dataset)
+        # if self.is_fsdp_xla_v2_enabled:
+        #    eval_dataloader = tpu_spmd_dataloader(eval_dataloader)
+
+        start_time = time.time()
+        self.start_time_after_warmup = None
+
+        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
+        output = eval_loop(
+            eval_dataloader,
+            description="Evaluation",
+            # No point gathering the predictions if there are no metrics, otherwise we defer to
+            # self.args.prediction_loss_only
+            prediction_loss_only=True if self.compute_metrics is None else None,
+            ignore_keys=ignore_keys,
+            metric_key_prefix=metric_key_prefix,
+        )
+
+        total_batch_size = self.args.eval_batch_size * self.args.world_size
+        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
+            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
+        num_samples = output.num_samples - self.args.throughput_warmup_steps * total_batch_size
+        num_steps = math.ceil(output.num_samples / total_batch_size) - self.args.throughput_warmup_steps
+
+        output.metrics.update(
+            speed_metrics(
+                metric_key_prefix,
+                start_time,
+                num_samples=num_samples,
+                num_steps=num_steps,
+                start_time_after_warmup=self.start_time_after_warmup,
+            )
+        )
+
+        self.log(output.metrics)
+
+        # if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
+        # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
+        # xm.master_print(met.metrics_report())
+
+        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)
+
+        self._memory_tracker.stop_and_update_metrics(output.metrics)
+
+        return output.metrics
+
     def evaluation_loop(
         self,
         dataloader: DataLoader,
@@ -1716,6 +1789,12 @@ def evaluation_loop(
         observed_num_examples = 0
         # Main evaluation loop
         for step, inputs in enumerate(dataloader):
+            if (
+                self.args.throughput_warmup_steps > 0
+                and not self.is_in_train
+                and step == self.args.throughput_warmup_steps
+            ):
+                self.start_time_after_warmup = time.time()
             # Update the observed num examples
             observed_batch_size = find_batch_size(inputs)
             if observed_batch_size is not None:

From ea79a3f82dec8aafcc64575a36870c45ba273201 Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Sun, 31 Mar 2024 12:50:50 +0200
Subject: [PATCH 6/7] Remove commented lines

---
 optimum/habana/transformers/trainer.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py
index e44884754b..c91d0724f8 100644
--- a/optimum/habana/transformers/trainer.py
+++ b/optimum/habana/transformers/trainer.py
@@ -1630,7 +1630,7 @@ def evaluate(
         """
         From https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/trainer.py#L3162 with the following modification
         1. comment out TPU related
-        2. use throughput_warmup_steps in evaulation throughput calculation
+        2. use throughput_warmup_steps in evaluation throughput calculation
         """
         # handle multipe eval datasets
         eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
@@ -1649,8 +1649,6 @@ def evaluate(
         self._memory_tracker.start()
 
         eval_dataloader = self.get_eval_dataloader(eval_dataset)
-        # if self.is_fsdp_xla_v2_enabled:
-        #    eval_dataloader = tpu_spmd_dataloader(eval_dataloader)
 
         start_time = time.time()
         self.start_time_after_warmup = None
@@ -1684,10 +1682,6 @@ def evaluate(
 
         self.log(output.metrics)
 
-        # if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
-        # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
-        # xm.master_print(met.metrics_report())
-
         self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)
 
         self._memory_tracker.stop_and_update_metrics(output.metrics)

From 96d39e58e1dca4056ac9ae29bbbf0161be78296b Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Sun, 31 Mar 2024 12:52:26 +0200
Subject: [PATCH 7/7] Adapt for main

---
 optimum/habana/version.py | 2 +-
 setup.py                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/optimum/habana/version.py b/optimum/habana/version.py
index 714a1d7075..497fd1da79 100644
--- a/optimum/habana/version.py
+++ b/optimum/habana/version.py
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.11.0"
+__version__ = "1.11.0.dev0"
diff --git a/setup.py b/setup.py
index c822c41e7f..5904ba2ea1 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@
 
 QUALITY_REQUIRES = [
     "ruff",
-    "hf_doc_builder",
+    "hf_doc_builder @ git+https://github.com/huggingface/doc-builder.git",
 ]
 
 EXTRAS_REQUIRE = {