diff --git a/examples/audio-classification/README.md b/examples/audio-classification/README.md index 1b7161d15e..ec227545a7 100644 --- a/examples/audio-classification/README.md +++ b/examples/audio-classification/README.md @@ -94,7 +94,8 @@ python ../gaudi_spawn.py \ --use_hpu_graphs_for_inference \ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ - --bf16 + --bf16 \ + --trust_remote_code True ``` On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.49%**. @@ -141,7 +142,8 @@ python ../gaudi_spawn.py \ --use_hpu_graphs_for_inference \ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ - --deepspeed ../../tests/configs/deepspeed_zero_2.json + --deepspeed ../../tests/configs/deepspeed_zero_2.json \ + --trust_remote_code True ``` [The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana. diff --git a/examples/audio-classification/run_audio_classification.py b/examples/audio-classification/run_audio_classification.py index b8c1e146c9..74e148efd5 100644 --- a/examples/audio-classification/run_audio_classification.py +++ b/examples/audio-classification/run_audio_classification.py @@ -167,9 +167,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -254,12 +254,14 @@ def main(): data_args.dataset_config_name, split=data_args.train_split_name, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) if data_args.audio_column_name not in raw_datasets["train"].column_names: diff --git a/examples/contrastive-image-text/run_bridgetower.py b/examples/contrastive-image-text/run_bridgetower.py index dd4b7e3fba..0d3498a511 100644 --- a/examples/contrastive-image-text/run_bridgetower.py +++ b/examples/contrastive-image-text/run_bridgetower.py @@ -102,9 +102,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -203,9 +203,9 @@ def __post_init__(self): if self.validation_file is not None: extension = self.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 
- if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension == "json", "`validation_file` should be a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." dataset_name_mapping = { @@ -328,6 +328,7 @@ def main(): data_dir=data_args.data_dir, token=model_args.token, revision=data_args.dataset_revision, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/contrastive-image-text/run_clip.py b/examples/contrastive-image-text/run_clip.py index 8d3b3a28a5..c1abae0011 100644 --- a/examples/contrastive-image-text/run_clip.py +++ b/examples/contrastive-image-text/run_clip.py @@ -107,9 +107,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -201,9 +201,9 @@ def __post_init__(self): if self.validation_file is not None: extension = self.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension == "json", "`validation_file` should be a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." dataset_name_mapping = { @@ -325,6 +325,7 @@ def main(): keep_in_memory=False, data_dir=data_args.data_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py index 9f25269b4b..30779dbc03 100644 --- a/examples/image-classification/run_image_classification.py +++ b/examples/image-classification/run_image_classification.py @@ -172,9 +172,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -259,6 +259,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index fb00e93fb2..9433e8f3bf 100644 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -131,9 +131,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -341,6 +341,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -350,6 +351,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, @@ -358,6 +360,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py index afc386b865..e7989f6d80 100644 --- a/examples/language-modeling/run_lora_clm.py +++ b/examples/language-modeling/run_lora_clm.py @@ -103,7 +103,11 @@ class ModelArguments: trust_remote_code: bool = field( default=False, metadata={ - "help": "should enable when using custom model architecture that is not yet part of the Hugging Face transformers package like MPT)." + "help": ( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
+ ) }, ) use_cache: bool = field( @@ -502,6 +506,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in raw_datasets.keys() and training_args.do_eval: @@ -511,6 +516,7 @@ def main(): split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, @@ -518,6 +524,7 @@ def main(): split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 17c2432760..18015ca515 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -129,9 +129,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -340,6 +340,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -349,6 +350,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, @@ -357,6 +359,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/language-modeling/run_prompt_tuning_clm.py b/examples/language-modeling/run_prompt_tuning_clm.py index 49043f3930..42798c0d5e 100644 --- a/examples/language-modeling/run_prompt_tuning_clm.py +++ b/examples/language-modeling/run_prompt_tuning_clm.py @@ -114,9 +114,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -248,6 +248,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) if data_args.dataset_name == "ought/raft" and data_args.dataset_config_name == "twitter_complaints": text_column = "Tweet text" diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index e58f7f42a2..b7022310c2 100644 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -102,9 +102,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -319,6 +319,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py index 50880a1f7c..ff56d5b4e6 100644 --- a/examples/question-answering/run_seq2seq_qa.py +++ b/examples/question-answering/run_seq2seq_qa.py @@ -102,9 +102,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -364,6 +364,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/speech-recognition/run_speech_recognition_ctc.py b/examples/speech-recognition/run_speech_recognition_ctc.py index 048da1dd5d..c01778d6d9 100644 --- a/examples/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/speech-recognition/run_speech_recognition_ctc.py @@ -261,9 +261,9 @@ class DataTrainingArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -467,6 +467,7 @@ def main(): data_args.dataset_config_name, split=data_args.train_split_name, token=data_args.token, + trust_remote_code=data_args.trust_remote_code, ) if data_args.audio_column_name not in raw_datasets["train"].column_names: @@ -492,6 +493,7 @@ def main(): data_args.dataset_config_name, split=data_args.eval_split_name, token=data_args.token, + trust_remote_code=data_args.trust_remote_code, ) if data_args.max_eval_samples is not None: diff --git a/examples/speech-recognition/run_speech_recognition_seq2seq.py b/examples/speech-recognition/run_speech_recognition_seq2seq.py index 06733f8e7c..5985825601 100755 --- a/examples/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/speech-recognition/run_speech_recognition_seq2seq.py @@ -106,9 +106,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -372,6 +372,7 @@ def main(): split=data_args.train_split_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) if training_args.do_eval: @@ -381,6 +382,7 @@ def main(): split=data_args.eval_split_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names: diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index db7a4913c9..2d83568092 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -124,9 +124,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -428,6 +428,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 155d1dd650..69a15579b0 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -213,9 +213,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." 
+ "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -325,6 +325,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: # Loading a dataset from your local files. @@ -458,7 +459,7 @@ def main(): label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} else: logger.warning( - "Your model seems to have been trained with labels, but they don't match the dataset: ", + "Your model seems to have been trained with labels, but they don't match the dataset: " f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}." "\nIgnoring the model labels as a result.", ) diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index b30c2e4447..0a24058f2a 100755 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -285,7 +285,7 @@ def setup_parser(parser): parser.add_argument( "--trust_remote_code", action="store_true", - help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", + help="Whether to trust the execution of code from datasets/models defined on the Hub. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine.", ) args = parser.parse_args() diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index db40ef8f28..942503c4ad 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -118,9 +118,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -380,6 +380,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/setup.py b/setup.py index dbf06aa333..d0899959a7 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,6 @@ "accelerate < 0.28.0", "diffusers >= 0.26.0, < 0.27.0", "huggingface_hub < 0.23.0", - "datasets < 2.20.0", ] TESTS_REQUIRE = [ diff --git a/tests/baselines/wav2vec2_base.json b/tests/baselines/wav2vec2_base.json index 3927ec4a5b..64c16c70de 100644 --- a/tests/baselines/wav2vec2_base.json +++ b/tests/baselines/wav2vec2_base.json @@ -21,7 +21,8 @@ "--seed 0", "--dataloader_num_workers 1", "--use_hpu_graphs_for_training", - "--use_hpu_graphs_for_inference" + "--use_hpu_graphs_for_inference", + "--trust_remote_code True" ] } } @@ -49,7 +50,8 @@ "--seed 0", "--dataloader_num_workers 1", "--use_hpu_graphs_for_training", - "--use_hpu_graphs_for_inference" + "--use_hpu_graphs_for_inference", + "--trust_remote_code True" ] } } diff --git a/tests/example_diff/run_audio_classification.txt b/tests/example_diff/run_audio_classification.txt index 278d3485ff..d7b474164d 100644 --- a/tests/example_diff/run_audio_classification.txt +++ b/tests/example_diff/run_audio_classification.txt @@ -30,7 +30,7 @@ > 47,48c48,50 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.40.0") @@ -76,13 +76,13 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -302a296,298 +304a298,300 > # Max input length > max_length = int(round(feature_extractor.sampling_rate * data_args.max_length_seconds)) > -307a304 +309a306 > -313c310,316 +315c312,318 < inputs = feature_extractor(subsampled_wavs, sampling_rate=feature_extractor.sampling_rate) --- > inputs = feature_extractor( @@ -92,7 +92,7 @@ > padding="max_length", > truncation=True, > ) -322c325,331 +324c327,333 < inputs = feature_extractor(wavs, sampling_rate=feature_extractor.sampling_rate) --- > inputs = feature_extractor( @@ -102,15 +102,15 @@ > padding="max_length", > truncation=True, > ) -368,369c377,378 +370,371c379,380 < # freeze the convolutional waveform encoder < if model_args.freeze_feature_encoder: --- > # freeze the convolutional waveform encoder if supported by model > if hasattr(model, "freeze_feature_encoder") and model_args.freeze_feature_encoder: -389c398 +391c400 < trainer = Trainer( --- > trainer = GaudiTrainer( -390a400 +392a402 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_clip.txt b/tests/example_diff/run_clip.txt index 2eebcc2d7b..1099d3c94a 100644 --- a/tests/example_diff/run_clip.txt +++ b/tests/example_diff/run_clip.txt @@ -25,7 +25,7 @@ > 56,57c63,65 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
> check_min_version("4.40.0") @@ -55,9 +55,9 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -419d438 +420d439 < image_transformations = torch.jit.script(image_transformations) -466,467c485,493 +467,468c486,494 < # Transform images on the fly as doing it on the whole dataset takes too much time. < train_dataset.set_transform(transform_images) --- @@ -70,7 +70,7 @@ > else: > # Transform images on the fly as doing it on the whole dataset takes too much time. > train_dataset.set_transform(transform_images) -489,490c515,523 +490,491c516,524 < # Transform images on the fly as doing it on the whole dataset takes too much time. < eval_dataset.set_transform(transform_images) --- @@ -83,7 +83,7 @@ > else: > # Transform images on the fly as doing it on the whole dataset takes too much time. > eval_dataset.set_transform(transform_images) -513a547,555 +514a548,556 > if data_args.mediapipe_dataloader: > test_dataset.image_mean = image_processor.image_mean > test_dataset.image_std = image_processor.image_std @@ -93,10 +93,10 @@ > else: > # Transform images on the fly as doing it on the whole dataset takes too much time. > test_dataset.set_transform(transform_images) -516c558,559 +517c559,560 < trainer = Trainer( --- > trainer_cls = HabanaDataloaderTrainer if data_args.mediapipe_dataloader else GaudiTrainer > trainer = trainer_cls( -517a561 +518a562 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_clm.txt b/tests/example_diff/run_clm.txt index 7db8099ecf..c91df2d5cd 100644 --- a/tests/example_diff/run_clm.txt +++ b/tests/example_diff/run_clm.txt @@ -25,7 +25,7 @@ > from optimum.habana.utils import set_seed 57,58d52 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 60c54,60 < require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") --- @@ -92,25 +92,25 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -387a417 +390a420 > "use_cache": False if training_args.gradient_checkpointing else model_args.use_cache, -483a514 +486a517 > -547a579,582 +550a582,585 > > def tensor_mapper(x): > return {i: torch.tensor(x[i], dtype=torch.int32) for i in x} > -550a586,587 +553a589,590 > if training_args.resume_from_checkpoint is not None and training_args.resume_from_checkpoint != "": > train_dataset = train_dataset.map(tensor_mapper) -581c618 +584c621 < trainer = Trainer( --- > trainer = GaudiTrainer( -582a620 +585a623 > gaudi_config=gaudi_config, -589,592c627,628 +592,595c630,631 < compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, < preprocess_logits_for_metrics=preprocess_logits_for_metrics < if training_args.do_eval and not is_torch_xla_available() @@ -118,12 +118,12 @@ --- > compute_metrics=compute_metrics if training_args.do_eval else None, > preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None, -603c639,640 +606c642,643 < trainer.save_model() # Saves the tokenizer too for easy upload --- > if data_args.save_last_ckpt: > trainer.save_model() # Saves the tokenizer too for easy upload -607,610c644,650 +610,613c647,653 < max_train_samples = ( < data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) < ) @@ -136,9 +136,9 @@ > data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) > ) > metrics["train_samples"] = min(max_train_samples, len(train_dataset)) -619d658 +622d661 < -622,623c661,666 +625,626c664,669 < max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) < metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) --- @@ -148,7 +148,7 @@ > ) > metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) > -646,650d688 +649,653d691 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_generation.txt b/tests/example_diff/run_generation.txt index 5da903f6e8..e1745cf95b 100644 --- a/tests/example_diff/run_generation.txt +++ b/tests/example_diff/run_generation.txt @@ -551,7 +551,7 @@ > parser.add_argument( > "--trust_remote_code", > action="store_true", -> help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", +> help="Whether to trust the execution of code from datasets/models defined on the Hub. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine.", 333d289 < parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference") 336,339c292,293 diff --git a/tests/example_diff/run_glue.txt b/tests/example_diff/run_glue.txt index 78cafcf01c..46005ba396 100644 --- a/tests/example_diff/run_glue.txt +++ b/tests/example_diff/run_glue.txt @@ -21,7 +21,7 @@ > return () 50,51c56,61 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") --- > > logger = logging.getLogger(__name__) @@ -63,21 +63,21 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -375a401 +376a402 > problem_type=data_args.problem_type, -416a443,447 +417a444,448 > if model_args.add_pad_token: > if not model.config.pad_token_id and not tokenizer.pad_token: > tokenizer.pad_token = tokenizer.eos_token > model.config.pad_token_id = tokenizer.eos_token_id > -527c558 +528c559 < trainer = Trainer( --- > trainer = GaudiTrainer( -528a560 +529a561 > gaudi_config=gaudi_config, -628,632d659 +629,633d660 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_image_classification.txt b/tests/example_diff/run_image_classification.txt index 0dbbe3f6c2..49ab2bb6a1 100644 --- a/tests/example_diff/run_image_classification.txt +++ b/tests/example_diff/run_image_classification.txt @@ -25,7 +25,7 @@ < """ Fine-tuning a 🤗 Transformers model for image classification""" 58,59c65,67 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.40.0") @@ -51,9 +51,9 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -392c409 +393c410 < trainer = Trainer( --- > trainer = GaudiTrainer( -393a411 +394a412 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_mlm.txt b/tests/example_diff/run_mlm.txt index 372a913834..3e4f6c5863 100644 --- a/tests/example_diff/run_mlm.txt +++ b/tests/example_diff/run_mlm.txt @@ -20,7 +20,7 @@ > from optimum.habana.utils import set_seed 56,57d51 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 59c53,59 < require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") --- @@ -75,13 +75,13 @@ > + f"mixed-precision training: {mixed_precision}" 289d305 < # Set the verbosity to info of the Transformers logger (on main process only): -617c633 +620c636 < trainer = Trainer( --- > trainer = GaudiTrainer( -618a635 +621a638 > gaudi_config=gaudi_config, -624,627c641,642 +627,630c644,645 < compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, < preprocess_logits_for_metrics=preprocess_logits_for_metrics < if training_args.do_eval and not is_torch_xla_available() @@ -89,7 +89,7 @@ --- > compute_metrics=compute_metrics if training_args.do_eval else None, > preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None, -641,644c656,662 +644,647c659,665 < max_train_samples = ( < data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) < ) @@ -102,9 +102,9 @@ > data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) > ) > metrics["train_samples"] = min(max_train_samples, len(train_dataset)) -653d670 +656d673 < -656,657c673,678 +659,660c676,681 < max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) < metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) --- @@ -114,7 +114,7 @@ > ) > metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) > -680,684d700 +683,687d703 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_qa.txt b/tests/example_diff/run_qa.txt index 118add46a1..961785aaac 100644 --- a/tests/example_diff/run_qa.txt +++ b/tests/example_diff/run_qa.txt @@ -19,7 +19,7 @@ > from optimum.habana.utils import set_seed 52,53d50 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 55c52,58 < require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") --- @@ -62,14 +62,14 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -346a365,368 +347a366,369 > if config.model_type == "llama": > if tokenizer.pad_token is None: > tokenizer.add_special_tokens({"pad_token": "[PAD]"}) > tokenizer.cls_token = tokenizer.bos_token -637a660 +638a661 > gaudi_config=gaudi_config, -706,710d728 +707,711d729 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_seq2seq_qa.txt b/tests/example_diff/run_seq2seq_qa.txt index 817c72b5a9..322661ff62 100644 --- a/tests/example_diff/run_seq2seq_qa.txt +++ b/tests/example_diff/run_seq2seq_qa.txt @@ -11,7 +11,7 @@ > from optimum.habana.utils import set_seed 48,49d46 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 51c48,54 < require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") --- @@ -54,9 +54,9 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -660a679 +661a680 > gaudi_config=gaudi_config, -734,738d752 +735,739d753 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_speech_recognition_ctc.txt b/tests/example_diff/run_speech_recognition_ctc.txt index 1fab0abcf2..a99ee732b3 100644 --- a/tests/example_diff/run_speech_recognition_ctc.txt +++ b/tests/example_diff/run_speech_recognition_ctc.txt @@ -13,7 +13,7 @@ > from optimum.habana.utils import set_seed 52,53d49 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 55c51,56 < require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") --- @@ -56,13 +56,14 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -451,457c465,470 +451,458c465,471 < if training_args.do_train: < raw_datasets["train"] = load_dataset( < data_args.dataset_name, < data_args.dataset_config_name, < split=data_args.train_split_name, < token=data_args.token, +< trust_remote_code=data_args.trust_remote_code, < ) --- > raw_datasets["train"] = load_dataset( @@ -70,8 +71,9 @@ > data_args.dataset_config_name, > split=data_args.train_split_name, > token=data_args.token, +> trust_remote_code=data_args.trust_remote_code, > ) -459,464c472,477 +460,465c473,478 < if data_args.audio_column_name not in raw_datasets["train"].column_names: < raise ValueError( < f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." @@ -85,7 +87,7 @@ > " Make sure to set `--audio_column_name` to the correct audio column - one of" > f" {', '.join(raw_datasets['train'].column_names)}." > ) -466,471c479,484 +467,472c480,485 < if data_args.text_column_name not in raw_datasets["train"].column_names: < raise ValueError( < f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " @@ -99,32 +101,32 @@ > "Make sure to set `--text_column_name` to the correct text column - one of " > f"{', '.join(raw_datasets['train'].column_names)}." > ) -473,474c486,487 +474,475c487,488 < if data_args.max_train_samples is not None: < raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) --- > if data_args.max_train_samples is not None: > raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) -492c505 +494c507 < f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None --- > f'[{"".join(data_args.chars_to_ignore).replace(" ", "")}]' if data_args.chars_to_ignore is not None else None -631a645,649 +633a647,651 > raise RuntimeError( > f"The dataset sampling rate ({dataset_sampling_rate}) is different from the feature extractor one" > f" ({feature_extractor.sampling_rate}).Data resampling should be done. The Datasets library does not" > " support it on HPUs yet." 
> ) -741c759,762 +743c761,764 < processor=processor, feature_extractor_input_name=feature_extractor_input_name --- > processor=processor, > feature_extractor_input_name=feature_extractor_input_name, > pad_to_multiple_of=int(max_input_length), > pad_to_multiple_of_labels=500, -745c766 +747c768 < trainer = Trainer( --- > trainer = GaudiTrainer( -746a768 +748a770 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_speech_recognition_seq2seq.txt b/tests/example_diff/run_speech_recognition_seq2seq.txt index 45b00bef9b..196d356171 100644 --- a/tests/example_diff/run_speech_recognition_seq2seq.txt +++ b/tests/example_diff/run_speech_recognition_seq2seq.txt @@ -20,7 +20,7 @@ > return () > 51c58,59 -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") --- > check_min_version("4.40.0") > check_optimum_habana_min_version("1.11.0") @@ -59,18 +59,18 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -442d466 +444d468 < model.generation_config.forced_decoder_ids = model_args.forced_decoder_ids -456a481,484 +458a483,486 > logger.warning( > f"The dataset sampling rate ({dataset_sampling_rate}) is different from the feature extractor one" > f" ({feature_extractor.sampling_rate}).Data resampling should be done." > ) -561a590 +563a592 > label_features_max_length=data_args.label_features_max_length, -565c594 +567c596 < trainer = Seq2SeqTrainer( --- > trainer = GaudiSeq2SeqTrainer( -566a596 +568a598 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_summarization.txt b/tests/example_diff/run_summarization.txt index 9f01193b14..81868ab221 100644 --- a/tests/example_diff/run_summarization.txt +++ b/tests/example_diff/run_summarization.txt @@ -23,7 +23,7 @@ > from optimum.habana.utils import set_seed 54,55d55 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 57c57,63 < require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") --- @@ -78,16 +78,16 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -431a463 +432a464 > use_cache=False if training_args.gradient_checkpointing else model_args.use_cache, -450a483,488 +451a484,489 > is_bart = model.config.model_type == "bart" > if is_bart and training_args.do_train: > raise ValueError( > "Training is not yet supported for BART. Eval or predict can be enabled with `--do_eval` and `--do_predict`." 
> ) > -453c491,498 +454c492,499 < embedding_size = model.get_input_embeddings().weight.shape[0] --- > embeddings = model.get_input_embeddings() @@ -98,16 +98,16 @@ > embedding_size = embeddings.weight.shape[0] > else: > embedding_size = embeddings.weight.shape[0] -486a532 +487a533 > suffix = data_args.source_suffix if data_args.source_suffix is not None else "" -557a604,605 +558a605,606 > else: > raise ValueError("Found case where either text or summary is missing.") -559c607 +560c608 < inputs = [prefix + inp for inp in inputs] --- > inputs = [prefix + inp + suffix for inp in inputs] -574a623,662 +575a624,663 > def preprocess_bucketing_function(examples): > # remove pairs where at least one record is None > @@ -148,22 +148,22 @@ > model_inputs["labels"] = labels["input_ids"] > return model_inputs > -589a678,683 +590a679,684 > def wrapper_preprocess_function(examples): > if model.config.is_encoder_decoder: > return preprocess_bucketing_function(examples) > else: > return preprocess_function(examples) > -598c692 +599c693 < preprocess_function, --- > wrapper_preprocess_function, -614c708 +615c709 < preprocess_function, --- > wrapper_preprocess_function, -624,629c718,726 +625,630c719,727 < data_collator = DataCollatorForSeq2Seq( < tokenizer, < model=model, @@ -180,7 +180,7 @@ > label_pad_token_id=label_pad_token_id, > pad_to_multiple_of=8 if training_args.fp16 else None, > ) -664,671c761,769 +665,672c762,773 < training_args.generation_max_length = ( < training_args.generation_max_length < if training_args.generation_max_length is not None @@ -196,16 +196,19 @@ > else: > training_args.generation_config.max_length = data_args.val_max_target_length > if data_args.num_beams is not None: +> if data_args.num_beams == 1: +> training_args.generation_config.length_penalty = None +> training_args.generation_config.early_stopping = False > training_args.generation_config.num_beams = data_args.num_beams > elif training_args.generation_num_beams is not None: > training_args.generation_config.num_beams = training_args.generation_num_beams -674c772 +675c776 < trainer = Seq2SeqTrainer( --- > trainer = GaudiSeq2SeqTrainer( -675a774 +676a778 > gaudi_config=gaudi_config, -764,768d862 +765,769d866 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_translation.txt b/tests/example_diff/run_translation.txt index 1aa504c06f..e7038d847c 100644 --- a/tests/example_diff/run_translation.txt +++ b/tests/example_diff/run_translation.txt @@ -15,7 +15,7 @@ > from optimum.habana.utils import set_seed 54,55d52 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 57c54,60 < require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") --- @@ -79,19 +79,19 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -384a419 +385a420 > use_cache=False if training_args.gradient_checkpointing else model_args.use_cache, -456c491 +457c492 < # Check the whether the source target length fits in the model, if it has absolute positional embeddings --- > # Check whether the source target length fits in the model, if it has absolute positional embeddings -594c629 +595c630 < trainer = Seq2SeqTrainer( --- > trainer = GaudiSeq2SeqTrainer( -595a631 +596a632 > gaudi_config=gaudi_config, -688,692d723 +689,693d724 < < < def _mp_fn(index):
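Note: every script change above applies the same pattern: expose a `trust_remote_code` flag on the argument dataclass and forward it to `datasets.load_dataset`, so that Hub datasets backed by loading scripts only execute with explicit opt-in. A minimal standalone sketch of that pattern follows (the dataset name and script name below are illustrative, not taken from any one example):

```python
from dataclasses import dataclass, field

from datasets import load_dataset
from transformers import HfArgumentParser


@dataclass
class ModelArguments:
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to trust the execution of code from datasets/models defined on the Hub."
                " This option should only be set to `True` for repositories you trust and in which"
                " you have read the code, as it will execute code present on the Hub on your local"
                " machine."
            )
        },
    )


if __name__ == "__main__":
    (model_args,) = HfArgumentParser(ModelArguments).parse_args_into_dataclasses()
    # Forward the flag so a dataset that ships a loading script is only executed
    # when the user opted in, e.g. `python sketch.py --trust_remote_code True`.
    dataset = load_dataset(
        "superb",  # illustrative: a dataset distributed with a loading script
        "ks",
        split="train",
        trust_remote_code=model_args.trust_remote_code,
    )
    print(dataset)
```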