From 45d4c171ff29b857c8daa6604e8b66b89cc97a70 Mon Sep 17 00:00:00 2001 From: Luca-Calabria Date: Tue, 22 Oct 2024 14:01:05 +0200 Subject: [PATCH 1/8] Create CI Eager/Lazy for Language Modeling --- tests/test_language_modeling_example.py | 92 +++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 tests/test_language_modeling_example.py diff --git a/tests/test_language_modeling_example.py b/tests/test_language_modeling_example.py new file mode 100644 index 0000000000..ee089467e6 --- /dev/null +++ b/tests/test_language_modeling_example.py @@ -0,0 +1,92 @@ +import json +import os +import re +import subprocess +from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest + +from .test_examples import TIME_PERF_FACTOR + + +prev_quant_model_name = None +prev_quant_rank = 0 + +if os.environ.get("GAUDI2_CI", "0") == "1": + # Gaudi2 CI baselines + MODELS_TO_TEST = { + "bf16_1x": [ + ("google/gemma-2b-it", "mamamiya405/finred", "Habana/gpt2", True, 9.5, 31.5), + ("google/gemma-2b-it", "mamamiya405/finred", "Habana/gpt2", False, 9.5, 31.5), + ], + } + +def _test_language_modeling( + model_name: str, + baseline_train: float, + baseline_eval: float, + token: str, + dataset_name: str, + gaudi_config_name: str, + use_lazy_mode: bool = False, + per_device_train_batch_size: int = 16, + per_device_eval_batch_size: int = 16, + num_train_epochs: int = 1, +): + command = ["python3"] + path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" + env_variables = os.environ.copy() + + if not use_lazy_mode: + env_variables["PT_HPU_LAZY_MODE"] = "0" + + command += [ + f"{path_to_example_dir / 'language-modeling' / 'run_clm.py'}", + f"--model_name_or_path {model_name}", + f"--per_device_train_batch_size {per_device_train_batch_size}", + f"--per_device_eval_batch_size {per_device_eval_batch_size}", + f"--dataset_name {dataset_name}", + "--use_habana", + "--do_train", + "--do_eval", + f"--gaudi_config_name {gaudi_config_name}", + "--gradient_checkpointing", + "--bf16", + f"--num_train_epochs {num_train_epochs}", + f"--use_lazy_mode {use_lazy_mode}", + ] + + with TemporaryDirectory() as tmp_dir: + command.append(f"--output_dir {tmp_dir} --overwrite_output_dir") + command.append(f"--token {token.value}") + pattern = re.compile(r"([\"\'].+?[\"\'])|\s") + + command = [x for y in command for x in re.split(pattern, y) if x] + print(f"\n\nCommand to test: {' '.join(command[:-2])}\n") + proc = subprocess.run(command, env=env_variables) + + # Ensure the run finished without any issue + # Use try-except to avoid logging the token if used + try: + assert proc.returncode == 0 + except AssertionError as e: + if "'--token', 'hf_" in e.args[0]: + e.args = (f"The following command failed:\n{' '.join(command[:-2])}",) + raise + + with open(Path(tmp_dir) / "train_results.json") as fp: + results = json.load(fp) + + # Ensure performance requirements (throughput) are met + assert results["train_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline_train + + with open(Path(tmp_dir) / "eval_results.json") as fp: + results = json.load(fp) + + assert results["eval_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline_eval + + +@pytest.mark.parametrize("model_name, dataset_name, gaudi_config_name, use_lazy_mode, baseline_train, baseline_eval", MODELS_TO_TEST["bf16_1x"]) +def test_language_modeling_bf16_1x(model_name: str, baseline_train: float, baseline_eval: float, dataset_name: str, gaudi_config_name: str, use_lazy_mode: bool, token: str): + _test_language_modeling(model_name, baseline_train, baseline_eval, token, dataset_name, gaudi_config_name, use_lazy_mode) From 1d5a80fcb28f6a9505eaa9abfd083c474c87285e Mon Sep 17 00:00:00 2001 From: Luca-Calabria Date: Tue, 22 Oct 2024 14:48:39 +0200 Subject: [PATCH 2/8] fix style --- tests/test_language_modeling_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_language_modeling_example.py b/tests/test_language_modeling_example.py index ee089467e6..814ce629dc 100644 --- a/tests/test_language_modeling_example.py +++ b/tests/test_language_modeling_example.py @@ -83,7 +83,7 @@ def _test_language_modeling( with open(Path(tmp_dir) / "eval_results.json") as fp: results = json.load(fp) - + assert results["eval_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline_eval From 1a9360fa279123173b39a46268eff5c5d16b1fdb Mon Sep 17 00:00:00 2001 From: Luca-Calabria Date: Tue, 22 Oct 2024 15:23:36 +0200 Subject: [PATCH 3/8] updated Eager baseline --- tests/test_language_modeling_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_language_modeling_example.py b/tests/test_language_modeling_example.py index 814ce629dc..ebd72a22c0 100644 --- a/tests/test_language_modeling_example.py +++ b/tests/test_language_modeling_example.py @@ -18,7 +18,7 @@ MODELS_TO_TEST = { "bf16_1x": [ ("google/gemma-2b-it", "mamamiya405/finred", "Habana/gpt2", True, 9.5, 31.5), - ("google/gemma-2b-it", "mamamiya405/finred", "Habana/gpt2", False, 9.5, 31.5), + ("google/gemma-2b-it", "mamamiya405/finred", "Habana/gpt2", False, 6.5, 24.01), ], } From d2b34937d20663d668e74d8d9ee42e93ce8d6546 Mon Sep 17 00:00:00 2001 From: Luca-Calabria Date: Tue, 5 Nov 2024 12:23:25 +0100 Subject: [PATCH 4/8] remove redundant argument --- tests/test_language_modeling_example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_language_modeling_example.py b/tests/test_language_modeling_example.py index ebd72a22c0..80a7be1414 100644 --- a/tests/test_language_modeling_example.py +++ b/tests/test_language_modeling_example.py @@ -54,7 +54,6 @@ def _test_language_modeling( "--gradient_checkpointing", "--bf16", f"--num_train_epochs {num_train_epochs}", - f"--use_lazy_mode {use_lazy_mode}", ] with TemporaryDirectory() as tmp_dir: From 962d0c5ab71bfafe4305f6d922acc16f8e981717 Mon Sep 17 00:00:00 2001 From: Luca-Calabria Date: Wed, 6 Nov 2024 18:07:07 +0100 Subject: [PATCH 5/8] added gemma test case to test_examples --- tests/baselines/gemma_2b_it.json | 32 +++++++++ tests/test_examples.py | 2 + tests/test_language_modeling_example.py | 91 ------------------------- tests/utils.py | 3 +- 4 files changed, 36 insertions(+), 92 deletions(-) create mode 100644 tests/baselines/gemma_2b_it.json delete mode 100644 tests/test_language_modeling_example.py diff --git a/tests/baselines/gemma_2b_it.json b/tests/baselines/gemma_2b_it.json new file mode 100644 index 0000000000..f103c2e7ac --- /dev/null +++ b/tests/baselines/gemma_2b_it.json @@ -0,0 +1,32 @@ +{ + "gaudi2": { + "wikitext": { + "num_train_epochs": 2, + "eval_batch_size": 4, + "distribution": { + "single_card": { + "learning_rate": 2e-4, + "train_batch_size": 4, + "perplexity": 26.39, + "train_runtime": 356.07, + "train_samples_per_second": 14.06, + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--use_hpu_graphs_for_inference" + ] + }, + "multi_card": { + "learning_rate": 8e-4, + "train_batch_size": 4, + "perplexity": 954.5995, + "train_runtime": 82.6617, + "train_samples_per_second": 94.524, + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--use_hpu_graphs_for_inference" + ] + } + } + } + } +} \ No newline at end of file diff --git a/tests/test_examples.py b/tests/test_examples.py index c5668e5b7c..49a68103bd 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -305,6 +305,8 @@ def to_test( return True elif "ast-finetuned-speech-commands-v2" in model_name and IS_GAUDI2: return True + elif "gemma" in model_name and IS_GAUDI2: + return True return False diff --git a/tests/test_language_modeling_example.py b/tests/test_language_modeling_example.py deleted file mode 100644 index 80a7be1414..0000000000 --- a/tests/test_language_modeling_example.py +++ /dev/null @@ -1,91 +0,0 @@ -import json -import os -import re -import subprocess -from pathlib import Path -from tempfile import TemporaryDirectory - -import pytest - -from .test_examples import TIME_PERF_FACTOR - - -prev_quant_model_name = None -prev_quant_rank = 0 - -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - MODELS_TO_TEST = { - "bf16_1x": [ - ("google/gemma-2b-it", "mamamiya405/finred", "Habana/gpt2", True, 9.5, 31.5), - ("google/gemma-2b-it", "mamamiya405/finred", "Habana/gpt2", False, 6.5, 24.01), - ], - } - -def _test_language_modeling( - model_name: str, - baseline_train: float, - baseline_eval: float, - token: str, - dataset_name: str, - gaudi_config_name: str, - use_lazy_mode: bool = False, - per_device_train_batch_size: int = 16, - per_device_eval_batch_size: int = 16, - num_train_epochs: int = 1, -): - command = ["python3"] - path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" - env_variables = os.environ.copy() - - if not use_lazy_mode: - env_variables["PT_HPU_LAZY_MODE"] = "0" - - command += [ - f"{path_to_example_dir / 'language-modeling' / 'run_clm.py'}", - f"--model_name_or_path {model_name}", - f"--per_device_train_batch_size {per_device_train_batch_size}", - f"--per_device_eval_batch_size {per_device_eval_batch_size}", - f"--dataset_name {dataset_name}", - "--use_habana", - "--do_train", - "--do_eval", - f"--gaudi_config_name {gaudi_config_name}", - "--gradient_checkpointing", - "--bf16", - f"--num_train_epochs {num_train_epochs}", - ] - - with TemporaryDirectory() as tmp_dir: - command.append(f"--output_dir {tmp_dir} --overwrite_output_dir") - command.append(f"--token {token.value}") - pattern = re.compile(r"([\"\'].+?[\"\'])|\s") - - command = [x for y in command for x in re.split(pattern, y) if x] - print(f"\n\nCommand to test: {' '.join(command[:-2])}\n") - proc = subprocess.run(command, env=env_variables) - - # Ensure the run finished without any issue - # Use try-except to avoid logging the token if used - try: - assert proc.returncode == 0 - except AssertionError as e: - if "'--token', 'hf_" in e.args[0]: - e.args = (f"The following command failed:\n{' '.join(command[:-2])}",) - raise - - with open(Path(tmp_dir) / "train_results.json") as fp: - results = json.load(fp) - - # Ensure performance requirements (throughput) are met - assert results["train_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline_train - - with open(Path(tmp_dir) / "eval_results.json") as fp: - results = json.load(fp) - - assert results["eval_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline_eval - - -@pytest.mark.parametrize("model_name, dataset_name, gaudi_config_name, use_lazy_mode, baseline_train, baseline_eval", MODELS_TO_TEST["bf16_1x"]) -def test_language_modeling_bf16_1x(model_name: str, baseline_train: float, baseline_eval: float, dataset_name: str, gaudi_config_name: str, use_lazy_mode: bool, token: str): - _test_language_modeling(model_name, baseline_train, baseline_eval, token, dataset_name, gaudi_config_name, use_lazy_mode) diff --git a/tests/utils.py b/tests/utils.py index 7eab1b06be..85adf2ee06 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -63,6 +63,7 @@ "qwen2": [("Qwen/Qwen2-7B", "Habana/qwen"), ("Qwen/Qwen2-72B", "Habana/qwen")], "idefics2": [("HuggingFaceM4/idefics2-8b", "Habana/gpt2")], "mllama": [("meta-llama/Llama-3.2-11B-Vision-Instruct", "Habana/gpt2")], + "gemma": [("google/gemma-2b-it", "Habana/gpt2")], } MODELS_TO_TEST_FOR_QUESTION_ANSWERING = [ @@ -81,7 +82,7 @@ # "distilbert", ] -MODELS_TO_TEST_FOR_CAUSAL_LANGUAGE_MODELING = ["gpt2", "gpt_neox", "bloom", "code_llama"] +MODELS_TO_TEST_FOR_CAUSAL_LANGUAGE_MODELING = ["gpt2", "gpt_neox", "bloom", "code_llama", "gemma"] MODELS_TO_TEST_FOR_SEQ2SEQ = ["t5"] From 63e930591ce0f0d23958062a539e7b4ba5c83f61 Mon Sep 17 00:00:00 2001 From: Luca-Calabria Date: Fri, 15 Nov 2024 15:49:38 +0100 Subject: [PATCH 6/8] added Eager Mode test case --- tests/baselines/gemma_2b_it_eager.json | 20 +++++++++++++ tests/test_examples.py | 40 ++++++++++++++++++++++---- 2 files changed, 55 insertions(+), 5 deletions(-) create mode 100644 tests/baselines/gemma_2b_it_eager.json diff --git a/tests/baselines/gemma_2b_it_eager.json b/tests/baselines/gemma_2b_it_eager.json new file mode 100644 index 0000000000..54ba546ccc --- /dev/null +++ b/tests/baselines/gemma_2b_it_eager.json @@ -0,0 +1,20 @@ +{ + "gaudi2": { + "wikitext": { + "num_train_epochs": 2, + "eval_batch_size": 4, + "distribution": { + "single_card": { + "learning_rate": 2e-4, + "train_batch_size": 4, + "perplexity": 26.69, + "train_runtime": 560.8188, + "train_samples_per_second": 8.597, + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1" + ] + } + } + } + } +} \ No newline at end of file diff --git a/tests/test_examples.py b/tests/test_examples.py index 49a68103bd..02f4e0708d 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -219,7 +219,14 @@ class ExampleTestMeta(type): @staticmethod def to_test( - model_name: str, multi_card: bool, deepspeed: bool, example_name: str, fsdp: bool, fp8: bool, task_name: str + model_name: str, + multi_card: bool, + deepspeed: bool, + example_name: str, + fsdp: bool, + fp8: bool, + eager_mode: bool, + task_name: str, ): models_with_specific_rules = [ "albert-xxlarge-v1", @@ -247,6 +254,8 @@ def to_test( "run_image2text_lora_finetune", ] + models_measured_on_eager_mode = ["google/gemma-2b-it"] + if (fsdp or fp8) and not IS_GAUDI2: return False elif ( @@ -271,6 +280,8 @@ def to_test( "ln_tuning", ): return False + elif eager_mode and not model_name in models_measured_on_eager_mode: + return False elif model_name not in models_with_specific_rules and not deepspeed: return True elif model_name == "gpt2-xl" and deepspeed: @@ -321,6 +332,7 @@ def __new__( fsdp=False, torch_compile=False, fp8=False, + eager_mode=False, ): distribution = "single_card" if multi_card: @@ -340,7 +352,7 @@ def __new__( ) for model_name, gaudi_config_name in models_to_test: - if cls.to_test(model_name, multi_card, deepspeed, example_name, fsdp, fp8, attrs["TASK_NAME"]): + if cls.to_test(model_name, multi_card, deepspeed, example_name, fsdp, fp8, eager_mode, attrs["TASK_NAME"]): attrs[f"test_{example_name}_{model_name.split('/')[-1]}_{distribution}"] = cls._create_test( model_name, gaudi_config_name, multi_card, deepspeed, fsdp, torch_compile, fp8 ) @@ -424,9 +436,15 @@ def test(self): create_clip_roberta_model() self._install_requirements(example_script.parent / "requirements.txt") - path_to_baseline = BASELINE_DIRECTORY / Path( - model_name.split("/")[-1].replace("-", "_").replace(".", "_") - ).with_suffix(".json") + + # collect baseline from _eager.json if eager_mode is True + if self.EAGER_MODE: + baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager" + else: + baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + + path_to_baseline = BASELINE_DIRECTORY / Path(baseline_name).with_suffix(".json") + with path_to_baseline.open("r") as json_file: device = "gaudi2" if IS_GAUDI2 else "gaudi" baseline = json.load(json_file)[device] @@ -474,6 +492,10 @@ def test(self): extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", []) + if self.EAGER_MODE: + env_variables["PT_HPU_LAZY_MODE"] = "0" + if "--use_hpu_graphs_for_inference" in extra_command_line_arguments: + extra_command_line_arguments.remove("--use_hpu_graphs_for_inference") if os.environ.get("DATA_CACHE", None) is not None and self.EXAMPLE_NAME == "run_clip": extra_command_line_arguments[0] = "--data_dir {}".format(os.environ["DATA_CACHE"]) elif torch_compile and ( @@ -548,6 +570,7 @@ class ExampleTesterBase(TestCase): "train_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR), "eval_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR), } + EAGER_MODE = None def _create_command_line( self, @@ -723,6 +746,13 @@ class MultiCardQuestionAnsweringExampleTester( TASK_NAME = "squad" +class EagerModeCausalLanguageModelingExampleTester( + ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_clm", eager_mode=True +): + TASK_NAME = "wikitext" + EAGER_MODE = True + + class CausalLanguageModelingExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_clm"): TASK_NAME = "wikitext" From 51e97a1214a4573f1b3019c879b1f2a5cc29b7d9 Mon Sep 17 00:00:00 2001 From: Luca-Calabria Date: Mon, 25 Nov 2024 18:14:39 +0100 Subject: [PATCH 7/8] change default EAGER_MODE variable --- tests/test_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_examples.py b/tests/test_examples.py index 02f4e0708d..5413295175 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -570,7 +570,7 @@ class ExampleTesterBase(TestCase): "train_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR), "eval_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR), } - EAGER_MODE = None + EAGER_MODE = False def _create_command_line( self, From 2a92d5ac9651ecd3575510be6a9f43833d825556 Mon Sep 17 00:00:00 2001 From: Luca-Calabria Date: Mon, 2 Dec 2024 11:30:37 +0100 Subject: [PATCH 8/8] fix by "make style" --- tests/test_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_examples.py b/tests/test_examples.py index def374f8a1..663d4bcd38 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -285,7 +285,7 @@ def to_test( "ln_tuning", ): return False - elif eager_mode and not model_name in models_measured_on_eager_mode: + elif eager_mode and model_name not in models_measured_on_eager_mode: return False elif model_name not in models_with_specific_rules and not deepspeed: return True