diff --git a/examples/audio-classification/README.md b/examples/audio-classification/README.md index 1b7161d15e..ec227545a7 100644 --- a/examples/audio-classification/README.md +++ b/examples/audio-classification/README.md @@ -94,7 +94,8 @@ python ../gaudi_spawn.py \ --use_hpu_graphs_for_inference \ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ - --bf16 + --bf16 \ + --trust_remote_code True ``` On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.49%**. @@ -141,7 +142,8 @@ python ../gaudi_spawn.py \ --use_hpu_graphs_for_inference \ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ - --deepspeed ../../tests/configs/deepspeed_zero_2.json + --deepspeed ../../tests/configs/deepspeed_zero_2.json \ + --trust_remote_code True ``` [The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana. diff --git a/examples/audio-classification/run_audio_classification.py b/examples/audio-classification/run_audio_classification.py index b8c1e146c9..74e148efd5 100644 --- a/examples/audio-classification/run_audio_classification.py +++ b/examples/audio-classification/run_audio_classification.py @@ -167,9 +167,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -254,12 +254,14 @@ def main(): data_args.dataset_config_name, split=data_args.train_split_name, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) if data_args.audio_column_name not in raw_datasets["train"].column_names: diff --git a/examples/contrastive-image-text/run_bridgetower.py b/examples/contrastive-image-text/run_bridgetower.py index dd4b7e3fba..0d3498a511 100644 --- a/examples/contrastive-image-text/run_bridgetower.py +++ b/examples/contrastive-image-text/run_bridgetower.py @@ -102,9 +102,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -203,9 +203,9 @@ def __post_init__(self): if self.validation_file is not None: extension = self.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." 
- if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension == "json", "`validation_file` should be a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." dataset_name_mapping = { @@ -328,6 +328,7 @@ def main(): data_dir=data_args.data_dir, token=model_args.token, revision=data_args.dataset_revision, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/contrastive-image-text/run_clip.py b/examples/contrastive-image-text/run_clip.py index 8d3b3a28a5..c1abae0011 100644 --- a/examples/contrastive-image-text/run_clip.py +++ b/examples/contrastive-image-text/run_clip.py @@ -107,9 +107,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -201,9 +201,9 @@ def __post_init__(self): if self.validation_file is not None: extension = self.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension == "json", "`validation_file` should be a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." dataset_name_mapping = { @@ -325,6 +325,7 @@ def main(): keep_in_memory=False, data_dir=data_args.data_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py index 9f25269b4b..30779dbc03 100644 --- a/examples/image-classification/run_image_classification.py +++ b/examples/image-classification/run_image_classification.py @@ -172,9 +172,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -259,6 +259,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index fb00e93fb2..9433e8f3bf 100644 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -131,9 +131,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -341,6 +341,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -350,6 +351,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, @@ -358,6 +360,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py index afc386b865..e7989f6d80 100644 --- a/examples/language-modeling/run_lora_clm.py +++ b/examples/language-modeling/run_lora_clm.py @@ -103,7 +103,11 @@ class ModelArguments: trust_remote_code: bool = field( default=False, metadata={ - "help": "should enable when using custom model architecture that is not yet part of the Hugging Face transformers package like MPT)." + "help": ( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
+ ) }, ) use_cache: bool = field( @@ -502,6 +506,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in raw_datasets.keys() and training_args.do_eval: @@ -511,6 +516,7 @@ def main(): split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, @@ -518,6 +524,7 @@ def main(): split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 17c2432760..18015ca515 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -129,9 +129,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -340,6 +340,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -349,6 +350,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, @@ -357,6 +359,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/language-modeling/run_prompt_tuning_clm.py b/examples/language-modeling/run_prompt_tuning_clm.py index 49043f3930..42798c0d5e 100644 --- a/examples/language-modeling/run_prompt_tuning_clm.py +++ b/examples/language-modeling/run_prompt_tuning_clm.py @@ -114,9 +114,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -248,6 +248,7 @@ def main(): cache_dir=model_args.cache_dir, token=model_args.token, streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, ) if data_args.dataset_name == "ought/raft" and data_args.dataset_config_name == "twitter_complaints": text_column = "Tweet text" diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index e58f7f42a2..b7022310c2 100644 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -102,9 +102,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -319,6 +319,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py index 50880a1f7c..ff56d5b4e6 100644 --- a/examples/question-answering/run_seq2seq_qa.py +++ b/examples/question-answering/run_seq2seq_qa.py @@ -102,9 +102,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -364,6 +364,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/speech-recognition/run_speech_recognition_ctc.py b/examples/speech-recognition/run_speech_recognition_ctc.py index 048da1dd5d..c01778d6d9 100644 --- a/examples/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/speech-recognition/run_speech_recognition_ctc.py @@ -261,9 +261,9 @@ class DataTrainingArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -467,6 +467,7 @@ def main(): data_args.dataset_config_name, split=data_args.train_split_name, token=data_args.token, + trust_remote_code=data_args.trust_remote_code, ) if data_args.audio_column_name not in raw_datasets["train"].column_names: @@ -492,6 +493,7 @@ def main(): data_args.dataset_config_name, split=data_args.eval_split_name, token=data_args.token, + trust_remote_code=data_args.trust_remote_code, ) if data_args.max_eval_samples is not None: diff --git a/examples/speech-recognition/run_speech_recognition_seq2seq.py b/examples/speech-recognition/run_speech_recognition_seq2seq.py index 06733f8e7c..5985825601 100755 --- a/examples/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/speech-recognition/run_speech_recognition_seq2seq.py @@ -106,9 +106,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -372,6 +372,7 @@ def main(): split=data_args.train_split_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) if training_args.do_eval: @@ -381,6 +382,7 @@ def main(): split=data_args.eval_split_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names: diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index db7a4913c9..2d83568092 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -124,9 +124,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -428,6 +428,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 155d1dd650..69a15579b0 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -213,9 +213,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." 
+ "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." ) }, ) @@ -325,6 +325,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: # Loading a dataset from your local files. @@ -458,7 +459,7 @@ def main(): label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} else: logger.warning( - "Your model seems to have been trained with labels, but they don't match the dataset: ", + "Your model seems to have been trained with labels, but they don't match the dataset: " f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}." "\nIgnoring the model labels as a result.", ) diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index b30c2e4447..0a24058f2a 100755 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -285,7 +285,7 @@ def setup_parser(parser): parser.add_argument( "--trust_remote_code", action="store_true", - help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", + help="Whether to trust the execution of code from datasets/models defined on the Hub. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine.", ) args = parser.parse_args() diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index db40ef8f28..942503c4ad 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -118,9 +118,9 @@ class ModelArguments: default=False, metadata={ "help": ( - "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " - "should only be set to `True` for repositories you trust and in which you have read the code, as it will " - "execute code present on the Hub on your local machine." + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." 
) }, ) @@ -380,6 +380,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, token=model_args.token, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} diff --git a/setup.py b/setup.py index dbf06aa333..d0899959a7 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,6 @@ "accelerate < 0.28.0", "diffusers >= 0.26.0, < 0.27.0", "huggingface_hub < 0.23.0", - "datasets < 2.20.0", ] TESTS_REQUIRE = [ diff --git a/tests/baselines/wav2vec2_base.json b/tests/baselines/wav2vec2_base.json index 3927ec4a5b..64c16c70de 100644 --- a/tests/baselines/wav2vec2_base.json +++ b/tests/baselines/wav2vec2_base.json @@ -21,7 +21,8 @@ "--seed 0", "--dataloader_num_workers 1", "--use_hpu_graphs_for_training", - "--use_hpu_graphs_for_inference" + "--use_hpu_graphs_for_inference", + "--trust_remote_code True" ] } } @@ -49,7 +50,8 @@ "--seed 0", "--dataloader_num_workers 1", "--use_hpu_graphs_for_training", - "--use_hpu_graphs_for_inference" + "--use_hpu_graphs_for_inference", + "--trust_remote_code True" ] } } diff --git a/tests/example_diff/run_audio_classification.txt b/tests/example_diff/run_audio_classification.txt index 278d3485ff..d7b474164d 100644 --- a/tests/example_diff/run_audio_classification.txt +++ b/tests/example_diff/run_audio_classification.txt @@ -30,7 +30,7 @@ > 47,48c48,50 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.40.0") @@ -76,13 +76,13 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -302a296,298 +304a298,300 > # Max input length > max_length = int(round(feature_extractor.sampling_rate * data_args.max_length_seconds)) > -307a304 +309a306 > -313c310,316 +315c312,318 < inputs = feature_extractor(subsampled_wavs, sampling_rate=feature_extractor.sampling_rate) --- > inputs = feature_extractor( @@ -92,7 +92,7 @@ > padding="max_length", > truncation=True, > ) -322c325,331 +324c327,333 < inputs = feature_extractor(wavs, sampling_rate=feature_extractor.sampling_rate) --- > inputs = feature_extractor( @@ -102,15 +102,15 @@ > padding="max_length", > truncation=True, > ) -368,369c377,378 +370,371c379,380 < # freeze the convolutional waveform encoder < if model_args.freeze_feature_encoder: --- > # freeze the convolutional waveform encoder if supported by model > if hasattr(model, "freeze_feature_encoder") and model_args.freeze_feature_encoder: -389c398 +391c400 < trainer = Trainer( --- > trainer = GaudiTrainer( -390a400 +392a402 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_clip.txt b/tests/example_diff/run_clip.txt index 2eebcc2d7b..1099d3c94a 100644 --- a/tests/example_diff/run_clip.txt +++ b/tests/example_diff/run_clip.txt @@ -25,7 +25,7 @@ > 56,57c63,65 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
> check_min_version("4.40.0") @@ -55,9 +55,9 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -419d438 +420d439 < image_transformations = torch.jit.script(image_transformations) -466,467c485,493 +467,468c486,494 < # Transform images on the fly as doing it on the whole dataset takes too much time. < train_dataset.set_transform(transform_images) --- @@ -70,7 +70,7 @@ > else: > # Transform images on the fly as doing it on the whole dataset takes too much time. > train_dataset.set_transform(transform_images) -489,490c515,523 +490,491c516,524 < # Transform images on the fly as doing it on the whole dataset takes too much time. < eval_dataset.set_transform(transform_images) --- @@ -83,7 +83,7 @@ > else: > # Transform images on the fly as doing it on the whole dataset takes too much time. > eval_dataset.set_transform(transform_images) -513a547,555 +514a548,556 > if data_args.mediapipe_dataloader: > test_dataset.image_mean = image_processor.image_mean > test_dataset.image_std = image_processor.image_std @@ -93,10 +93,10 @@ > else: > # Transform images on the fly as doing it on the whole dataset takes too much time. > test_dataset.set_transform(transform_images) -516c558,559 +517c559,560 < trainer = Trainer( --- > trainer_cls = HabanaDataloaderTrainer if data_args.mediapipe_dataloader else GaudiTrainer > trainer = trainer_cls( -517a561 +518a562 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_clm.txt b/tests/example_diff/run_clm.txt index 7db8099ecf..c91df2d5cd 100644 --- a/tests/example_diff/run_clm.txt +++ b/tests/example_diff/run_clm.txt @@ -25,7 +25,7 @@ > from optimum.habana.utils import set_seed 57,58d52 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 60c54,60 < require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") --- @@ -92,25 +92,25 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -387a417 +390a420 > "use_cache": False if training_args.gradient_checkpointing else model_args.use_cache, -483a514 +486a517 > -547a579,582 +550a582,585 > > def tensor_mapper(x): > return {i: torch.tensor(x[i], dtype=torch.int32) for i in x} > -550a586,587 +553a589,590 > if training_args.resume_from_checkpoint is not None and training_args.resume_from_checkpoint != "": > train_dataset = train_dataset.map(tensor_mapper) -581c618 +584c621 < trainer = Trainer( --- > trainer = GaudiTrainer( -582a620 +585a623 > gaudi_config=gaudi_config, -589,592c627,628 +592,595c630,631 < compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, < preprocess_logits_for_metrics=preprocess_logits_for_metrics < if training_args.do_eval and not is_torch_xla_available() @@ -118,12 +118,12 @@ --- > compute_metrics=compute_metrics if training_args.do_eval else None, > preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None, -603c639,640 +606c642,643 < trainer.save_model() # Saves the tokenizer too for easy upload --- > if data_args.save_last_ckpt: > trainer.save_model() # Saves the tokenizer too for easy upload -607,610c644,650 +610,613c647,653 < max_train_samples = ( < data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) < ) @@ -136,9 +136,9 @@ > data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) > ) > metrics["train_samples"] = min(max_train_samples, len(train_dataset)) -619d658 +622d661 < -622,623c661,666 +625,626c664,669 < max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) < metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) --- @@ -148,7 +148,7 @@ > ) > metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) > -646,650d688 +649,653d691 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_generation.txt b/tests/example_diff/run_generation.txt index 5da903f6e8..e1745cf95b 100644 --- a/tests/example_diff/run_generation.txt +++ b/tests/example_diff/run_generation.txt @@ -551,7 +551,7 @@ > parser.add_argument( > "--trust_remote_code", > action="store_true", -> help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", +> help="Whether to trust the execution of code from datasets/models defined on the Hub. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine.", 333d289 < parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference") 336,339c292,293 diff --git a/tests/example_diff/run_glue.txt b/tests/example_diff/run_glue.txt index 78cafcf01c..46005ba396 100644 --- a/tests/example_diff/run_glue.txt +++ b/tests/example_diff/run_glue.txt @@ -21,7 +21,7 @@ > return () 50,51c56,61 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") --- > > logger = logging.getLogger(__name__) @@ -63,21 +63,21 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -375a401 +376a402 > problem_type=data_args.problem_type, -416a443,447 +417a444,448 > if model_args.add_pad_token: > if not model.config.pad_token_id and not tokenizer.pad_token: > tokenizer.pad_token = tokenizer.eos_token > model.config.pad_token_id = tokenizer.eos_token_id > -527c558 +528c559 < trainer = Trainer( --- > trainer = GaudiTrainer( -528a560 +529a561 > gaudi_config=gaudi_config, -628,632d659 +629,633d660 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_image_classification.txt b/tests/example_diff/run_image_classification.txt index 0dbbe3f6c2..49ab2bb6a1 100644 --- a/tests/example_diff/run_image_classification.txt +++ b/tests/example_diff/run_image_classification.txt @@ -25,7 +25,7 @@ < """ Fine-tuning a 🤗 Transformers model for image classification""" 58,59c65,67 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.40.0") @@ -51,9 +51,9 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -392c409 +393c410 < trainer = Trainer( --- > trainer = GaudiTrainer( -393a411 +394a412 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_mlm.txt b/tests/example_diff/run_mlm.txt index 372a913834..3e4f6c5863 100644 --- a/tests/example_diff/run_mlm.txt +++ b/tests/example_diff/run_mlm.txt @@ -20,7 +20,7 @@ > from optimum.habana.utils import set_seed 56,57d51 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 59c53,59 < require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") --- @@ -75,13 +75,13 @@ > + f"mixed-precision training: {mixed_precision}" 289d305 < # Set the verbosity to info of the Transformers logger (on main process only): -617c633 +620c636 < trainer = Trainer( --- > trainer = GaudiTrainer( -618a635 +621a638 > gaudi_config=gaudi_config, -624,627c641,642 +627,630c644,645 < compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, < preprocess_logits_for_metrics=preprocess_logits_for_metrics < if training_args.do_eval and not is_torch_xla_available() @@ -89,7 +89,7 @@ --- > compute_metrics=compute_metrics if training_args.do_eval else None, > preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None, -641,644c656,662 +644,647c659,665 < max_train_samples = ( < data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) < ) @@ -102,9 +102,9 @@ > data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) > ) > metrics["train_samples"] = min(max_train_samples, len(train_dataset)) -653d670 +656d673 < -656,657c673,678 +659,660c676,681 < max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) < metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) --- @@ -114,7 +114,7 @@ > ) > metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) > -680,684d700 +683,687d703 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_qa.txt b/tests/example_diff/run_qa.txt index 118add46a1..961785aaac 100644 --- a/tests/example_diff/run_qa.txt +++ b/tests/example_diff/run_qa.txt @@ -19,7 +19,7 @@ > from optimum.habana.utils import set_seed 52,53d50 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 55c52,58 < require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") --- @@ -62,14 +62,14 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -346a365,368 +347a366,369 > if config.model_type == "llama": > if tokenizer.pad_token is None: > tokenizer.add_special_tokens({"pad_token": "[PAD]"}) > tokenizer.cls_token = tokenizer.bos_token -637a660 +638a661 > gaudi_config=gaudi_config, -706,710d728 +707,711d729 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_seq2seq_qa.txt b/tests/example_diff/run_seq2seq_qa.txt index 817c72b5a9..322661ff62 100644 --- a/tests/example_diff/run_seq2seq_qa.txt +++ b/tests/example_diff/run_seq2seq_qa.txt @@ -11,7 +11,7 @@ > from optimum.habana.utils import set_seed 48,49d46 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 51c48,54 < require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") --- @@ -54,9 +54,9 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -660a679 +661a680 > gaudi_config=gaudi_config, -734,738d752 +735,739d753 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_speech_recognition_ctc.txt b/tests/example_diff/run_speech_recognition_ctc.txt index 1fab0abcf2..a99ee732b3 100644 --- a/tests/example_diff/run_speech_recognition_ctc.txt +++ b/tests/example_diff/run_speech_recognition_ctc.txt @@ -13,7 +13,7 @@ > from optimum.habana.utils import set_seed 52,53d49 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 55c51,56 < require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") --- @@ -56,13 +56,14 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -451,457c465,470 +451,458c465,471 < if training_args.do_train: < raw_datasets["train"] = load_dataset( < data_args.dataset_name, < data_args.dataset_config_name, < split=data_args.train_split_name, < token=data_args.token, +< trust_remote_code=data_args.trust_remote_code, < ) --- > raw_datasets["train"] = load_dataset( @@ -70,8 +71,9 @@ > data_args.dataset_config_name, > split=data_args.train_split_name, > token=data_args.token, +> trust_remote_code=data_args.trust_remote_code, > ) -459,464c472,477 +460,465c473,478 < if data_args.audio_column_name not in raw_datasets["train"].column_names: < raise ValueError( < f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." @@ -85,7 +87,7 @@ > " Make sure to set `--audio_column_name` to the correct audio column - one of" > f" {', '.join(raw_datasets['train'].column_names)}." > ) -466,471c479,484 +467,472c480,485 < if data_args.text_column_name not in raw_datasets["train"].column_names: < raise ValueError( < f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " @@ -99,32 +101,32 @@ > "Make sure to set `--text_column_name` to the correct text column - one of " > f"{', '.join(raw_datasets['train'].column_names)}." > ) -473,474c486,487 +474,475c487,488 < if data_args.max_train_samples is not None: < raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) --- > if data_args.max_train_samples is not None: > raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) -492c505 +494c507 < f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None --- > f'[{"".join(data_args.chars_to_ignore).replace(" ", "")}]' if data_args.chars_to_ignore is not None else None -631a645,649 +633a647,651 > raise RuntimeError( > f"The dataset sampling rate ({dataset_sampling_rate}) is different from the feature extractor one" > f" ({feature_extractor.sampling_rate}).Data resampling should be done. The Datasets library does not" > " support it on HPUs yet." 
> ) -741c759,762 +743c761,764 < processor=processor, feature_extractor_input_name=feature_extractor_input_name --- > processor=processor, > feature_extractor_input_name=feature_extractor_input_name, > pad_to_multiple_of=int(max_input_length), > pad_to_multiple_of_labels=500, -745c766 +747c768 < trainer = Trainer( --- > trainer = GaudiTrainer( -746a768 +748a770 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_speech_recognition_seq2seq.txt b/tests/example_diff/run_speech_recognition_seq2seq.txt index 45b00bef9b..196d356171 100644 --- a/tests/example_diff/run_speech_recognition_seq2seq.txt +++ b/tests/example_diff/run_speech_recognition_seq2seq.txt @@ -20,7 +20,7 @@ > return () > 51c58,59 -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") --- > check_min_version("4.40.0") > check_optimum_habana_min_version("1.11.0") @@ -59,18 +59,18 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -442d466 +444d468 < model.generation_config.forced_decoder_ids = model_args.forced_decoder_ids -456a481,484 +458a483,486 > logger.warning( > f"The dataset sampling rate ({dataset_sampling_rate}) is different from the feature extractor one" > f" ({feature_extractor.sampling_rate}).Data resampling should be done." > ) -561a590 +563a592 > label_features_max_length=data_args.label_features_max_length, -565c594 +567c596 < trainer = Seq2SeqTrainer( --- > trainer = GaudiSeq2SeqTrainer( -566a596 +568a598 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_summarization.txt b/tests/example_diff/run_summarization.txt index 9f01193b14..81868ab221 100644 --- a/tests/example_diff/run_summarization.txt +++ b/tests/example_diff/run_summarization.txt @@ -23,7 +23,7 @@ > from optimum.habana.utils import set_seed 54,55d55 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 57c57,63 < require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") --- @@ -78,16 +78,16 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -431a463 +432a464 > use_cache=False if training_args.gradient_checkpointing else model_args.use_cache, -450a483,488 +451a484,489 > is_bart = model.config.model_type == "bart" > if is_bart and training_args.do_train: > raise ValueError( > "Training is not yet supported for BART. Eval or predict can be enabled with `--do_eval` and `--do_predict`." 
> ) > -453c491,498 +454c492,499 < embedding_size = model.get_input_embeddings().weight.shape[0] --- > embeddings = model.get_input_embeddings() @@ -98,16 +98,16 @@ > embedding_size = embeddings.weight.shape[0] > else: > embedding_size = embeddings.weight.shape[0] -486a532 +487a533 > suffix = data_args.source_suffix if data_args.source_suffix is not None else "" -557a604,605 +558a605,606 > else: > raise ValueError("Found case where either text or summary is missing.") -559c607 +560c608 < inputs = [prefix + inp for inp in inputs] --- > inputs = [prefix + inp + suffix for inp in inputs] -574a623,662 +575a624,663 > def preprocess_bucketing_function(examples): > # remove pairs where at least one record is None > @@ -148,22 +148,22 @@ > model_inputs["labels"] = labels["input_ids"] > return model_inputs > -589a678,683 +590a679,684 > def wrapper_preprocess_function(examples): > if model.config.is_encoder_decoder: > return preprocess_bucketing_function(examples) > else: > return preprocess_function(examples) > -598c692 +599c693 < preprocess_function, --- > wrapper_preprocess_function, -614c708 +615c709 < preprocess_function, --- > wrapper_preprocess_function, -624,629c718,726 +625,630c719,727 < data_collator = DataCollatorForSeq2Seq( < tokenizer, < model=model, @@ -180,7 +180,7 @@ > label_pad_token_id=label_pad_token_id, > pad_to_multiple_of=8 if training_args.fp16 else None, > ) -664,671c761,769 +665,672c762,773 < training_args.generation_max_length = ( < training_args.generation_max_length < if training_args.generation_max_length is not None @@ -196,16 +196,19 @@ > else: > training_args.generation_config.max_length = data_args.val_max_target_length > if data_args.num_beams is not None: +> if data_args.num_beams == 1: +> training_args.generation_config.length_penalty = None +> training_args.generation_config.early_stopping = False > training_args.generation_config.num_beams = data_args.num_beams > elif training_args.generation_num_beams is not None: > training_args.generation_config.num_beams = training_args.generation_num_beams -674c772 +675c776 < trainer = Seq2SeqTrainer( --- > trainer = GaudiSeq2SeqTrainer( -675a774 +676a778 > gaudi_config=gaudi_config, -764,768d862 +765,769d866 < < < def _mp_fn(index): diff --git a/tests/example_diff/run_translation.txt b/tests/example_diff/run_translation.txt index 1aa504c06f..e7038d847c 100644 --- a/tests/example_diff/run_translation.txt +++ b/tests/example_diff/run_translation.txt @@ -15,7 +15,7 @@ > from optimum.habana.utils import set_seed 54,55d52 < # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-< check_min_version("4.42.0.dev0") +< check_min_version("4.44.0.dev0") 57c54,60 < require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") --- @@ -79,19 +79,19 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -384a419 +385a420 > use_cache=False if training_args.gradient_checkpointing else model_args.use_cache, -456c491 +457c492 < # Check the whether the source target length fits in the model, if it has absolute positional embeddings --- > # Check whether the source target length fits in the model, if it has absolute positional embeddings -594c629 +595c630 < trainer = Seq2SeqTrainer( --- > trainer = GaudiSeq2SeqTrainer( -595a631 +596a632 > gaudi_config=gaudi_config, -688,692d723 +689,693d724 < < < def _mp_fn(index):
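Note: every script change above applies the same pattern: expose a `trust_remote_code` flag on the argument dataclass and forward it to `datasets.load_dataset`, so that Hub datasets backed by loading scripts only execute with explicit opt-in. A minimal standalone sketch of that pattern follows (the dataset name and script name below are illustrative, not taken from any one example):

```python
from dataclasses import dataclass, field

from datasets import load_dataset
from transformers import HfArgumentParser


@dataclass
class ModelArguments:
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to trust the execution of code from datasets/models defined on the Hub."
                " This option should only be set to `True` for repositories you trust and in which"
                " you have read the code, as it will execute code present on the Hub on your local"
                " machine."
            )
        },
    )


if __name__ == "__main__":
    (model_args,) = HfArgumentParser(ModelArguments).parse_args_into_dataclasses()
    # Forward the flag so a dataset that ships a loading script is only executed
    # when the user opted in, e.g. `python sketch.py --trust_remote_code True`.
    dataset = load_dataset(
        "superb",  # illustrative: a dataset distributed with a loading script
        "ks",
        split="train",
        trust_remote_code=model_args.trust_remote_code,
    )
    print(dataset)
```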