From d9d4fc079cc065c3d0f42efe2c5eca0e6bc4ea3b Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 29 Mar 2024 23:08:38 +0000 Subject: [PATCH 1/7] Release: v1.11.0 --- optimum/habana/version.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/habana/version.py b/optimum/habana/version.py index 497fd1da79..714a1d7075 100644 --- a/optimum/habana/version.py +++ b/optimum/habana/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.11.0.dev0" +__version__ = "1.11.0" diff --git a/setup.py b/setup.py index 5904ba2ea1..c822c41e7f 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ QUALITY_REQUIRES = [ "ruff", - "hf_doc_builder @ git+https://github.com/huggingface/doc-builder.git", + "hf_doc_builder", ] EXTRAS_REQUIRE = { From 4160e9c8a068a515e3981b37690cebe1551ffe8c Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Sat, 30 Mar 2024 02:30:23 -0700 Subject: [PATCH 2/7] Fix fp8 ci (#852) Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> --- tests/test_text_generation_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index 8f3da77526..48150d635c 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -136,7 +136,7 @@ def _test_text_generation( env_variables["QUANT_CONFIG"] = os.path.join( path_to_example_dir, "text-generation/quantization_config/maxabs_quant.json" ) - command.insert(-1, "--fp8") + command.insert(-2, "--fp8") proc = subprocess.run(command, env=env_variables) From b0eefc55da46276b75fdf17b75f67d33ff92f0f3 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Sat, 30 Mar 2024 10:41:13 +0100 Subject: [PATCH 3/7] Fix PR #848 (#853) --- tests/test_examples.py | 10 +++++----- 1 file changed, 
5 insertions(+), 5 deletions(-) diff --git a/tests/test_examples.py b/tests/test_examples.py index 5cf2559f5f..6eefb5c571 100755 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -276,7 +276,7 @@ def test(self): self.assertEqual(return_code, 0) return elif self.EXAMPLE_NAME == "run_clip": - if not os.environ.get("DATA_CACHE", "0"): + if os.environ.get("DATA_CACHE", None) is None: from .clip_coco_utils import COCO_URLS, download_files download_files(COCO_URLS) @@ -327,8 +327,8 @@ def test(self): extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", []) - if os.environ.get("DATA_CACHE", "0") and self.EXAMPLE_NAME == "run_clip": - extra_command_line_arguments[0] = "--data_dir {}".format(os.environ.get("DATA_CACHE", "$PWD")) + if os.environ.get("DATA_CACHE", None) is not None and self.EXAMPLE_NAME == "run_clip": + extra_command_line_arguments[0] = "--data_dir {}".format(os.environ["DATA_CACHE"]) with TemporaryDirectory() as tmp_dir: cmd_line = self._create_command_line( @@ -410,7 +410,7 @@ def _create_command_line( task: Optional[str] = None, extra_command_line_arguments: Optional[List[str]] = None, ) -> List[str]: - dataset_name = self.DATASET_NAME if self.DATASET_NAME else task + dataset_name = self.DATASET_NAME if self.DATASET_NAME is not None else task task_option = f"--{self.DATASET_PARAMETER_NAME} {dataset_name}" if task else " " cmd_line = ["python3"] @@ -583,7 +583,7 @@ class MultiCardSpeechRecognitionExampleTester( ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_speech_recognition_ctc", multi_card=True ): TASK_NAME = "regisss/librispeech_asr_for_optimum_habana_ci" - DATASET_NAME = os.environ.get("DATA_CACHE", 0) + DATASET_NAME = os.environ.get("DATA_CACHE", None) class MultiCardSummarizationExampleTester( From 8ee87de9085d13593c1b20ada9b1b99a0715da69 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Sat, 30 Mar 2024 11:00:50 +0100 Subject: [PATCH 
4/7] Disable safe loading tests in CI (#854) --- tests/test_trainer.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 1963f805c7..2cb3147523 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -49,6 +49,7 @@ get_gpu_count, get_tests_dir, is_staging_test, + parse_flag_from_env, require_optuna, require_safetensors, require_sentencepiece, @@ -90,6 +91,20 @@ PATH_SAMPLE_TEXT = f"{get_tests_dir()}/fixtures/sample_text.txt" +_run_safe_loading_tests_ = parse_flag_from_env("SAFE_LOADING_TESTS", default=False) + + +def safe_loading_test(test_case): + """ + Decorator marking a test as a safe loading (safetensors checkpoint) test. + These tests are disabled in CI. + + Such tests are skipped by default. Set the SAFE_LOADING_TESTS environment variable to a truthy value to run them. + + """ + return unittest.skipUnless(_run_safe_loading_tests_, "test requires SAFE_LOADING_TESTS")(test_case) + + class RegressionDataset: def __init__(self, a=2, b=3, length=64, seed=42, label_names=None): np.random.seed(seed) @@ -1465,6 +1480,7 @@ def test_training_with_resume_from_checkpoint_false(self): trainer.train(resume_from_checkpoint=False) + @safe_loading_test @require_safetensors def test_resume_training_with_safe_checkpoint(self): # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of @@ -1658,6 +1674,7 @@ def test_load_best_model_at_end(self): self.check_saved_checkpoints(tmpdir, 5, total, is_pretrained=False) self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss", is_pretrained=False) + @safe_loading_test @require_safetensors def test_load_best_model_from_safetensors(self): total = int(self.n_epochs * 64 / self.batch_size) From 221ac07a811fe1326f7cdb9f168fb12e957e3eac Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Sat, 30 Mar 2024 19:10:15 
+0000 Subject: [PATCH 5/7] Add warmup_step for eval throughput calculation. --- optimum/habana/transformers/trainer.py | 79 ++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py index 4f22bdde37..e44884754b 100644 --- a/optimum/habana/transformers/trainer.py +++ b/optimum/habana/transformers/trainer.py @@ -1621,6 +1621,79 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) + def evaluate( + self, + eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> Dict[str, float]: + """ + From https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/trainer.py#L3162 with the following modification + 1. comment out TPU related + 2. 
use throughput_warmup_steps in evaulation throughput calculation + """ + # handle multipe eval datasets + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + if isinstance(eval_dataset, dict): + metrics = {} + for eval_dataset_name, _eval_dataset in eval_dataset.items(): + dataset_metrics = self.evaluate( + eval_dataset=_eval_dataset, + ignore_keys=ignore_keys, + metric_key_prefix=f"{metric_key_prefix}_{eval_dataset_name}", + ) + metrics.update(dataset_metrics) + return metrics + + # memory metrics - must set up as early as possible + self._memory_tracker.start() + + eval_dataloader = self.get_eval_dataloader(eval_dataset) + # if self.is_fsdp_xla_v2_enabled: + # eval_dataloader = tpu_spmd_dataloader(eval_dataloader) + + start_time = time.time() + self.start_time_after_warmup = None + + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + output = eval_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if self.compute_metrics is None else None, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, + ) + + total_batch_size = self.args.eval_batch_size * self.args.world_size + if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] + num_samples = output.num_samples - self.args.throughput_warmup_steps * total_batch_size + num_steps = math.ceil(output.num_samples / total_batch_size) - self.args.throughput_warmup_steps + + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=num_samples, + num_steps=num_steps, + start_time_after_warmup=self.start_time_after_warmup, + ) + ) + + self.log(output.metrics) + + # if DebugOption.TPU_METRICS_DEBUG in self.args.debug: + # tpu-comment: Logging debug metrics for PyTorch/XLA 
(compile, execute times, ops, etc.) + # xm.master_print(met.metrics_report()) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) + + self._memory_tracker.stop_and_update_metrics(output.metrics) + + return output.metrics + def evaluation_loop( self, dataloader: DataLoader, @@ -1716,6 +1789,12 @@ def evaluation_loop( observed_num_examples = 0 # Main evaluation loop for step, inputs in enumerate(dataloader): + if ( + self.args.throughput_warmup_steps > 0 + and not self.is_in_train + and step == self.args.throughput_warmup_steps + ): + self.start_time_after_warmup = time.time() # Update the observed num examples observed_batch_size = find_batch_size(inputs) if observed_batch_size is not None: From ea79a3f82dec8aafcc64575a36870c45ba273201 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Sun, 31 Mar 2024 12:50:50 +0200 Subject: [PATCH 6/7] Remove commented lines --- optimum/habana/transformers/trainer.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py index e44884754b..c91d0724f8 100644 --- a/optimum/habana/transformers/trainer.py +++ b/optimum/habana/transformers/trainer.py @@ -1630,7 +1630,7 @@ def evaluate( """ From https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/trainer.py#L3162 with the following modification 1. comment out TPU related - 2. use throughput_warmup_steps in evaulation throughput calculation + 2. 
use throughput_warmup_steps in evaluation throughput calculation """ # handle multipe eval datasets eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset @@ -1649,8 +1649,6 @@ def evaluate( self._memory_tracker.start() eval_dataloader = self.get_eval_dataloader(eval_dataset) - # if self.is_fsdp_xla_v2_enabled: - # eval_dataloader = tpu_spmd_dataloader(eval_dataloader) start_time = time.time() self.start_time_after_warmup = None @@ -1684,10 +1682,6 @@ def evaluate( self.log(output.metrics) - # if DebugOption.TPU_METRICS_DEBUG in self.args.debug: - # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) - # xm.master_print(met.metrics_report()) - self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) self._memory_tracker.stop_and_update_metrics(output.metrics) From 96d39e58e1dca4056ac9ae29bbbf0161be78296b Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Sun, 31 Mar 2024 12:52:26 +0200 Subject: [PATCH 7/7] Adapt for main --- optimum/habana/version.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/habana/version.py b/optimum/habana/version.py index 714a1d7075..497fd1da79 100644 --- a/optimum/habana/version.py +++ b/optimum/habana/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.11.0" +__version__ = "1.11.0.dev0" diff --git a/setup.py b/setup.py index c822c41e7f..5904ba2ea1 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ QUALITY_REQUIRES = [ "ruff", - "hf_doc_builder", + "hf_doc_builder @ git+https://github.com/huggingface/doc-builder.git", ] EXTRAS_REQUIRE = {