From 4d8d4873a18f185e4bdd86241a96deb9c2f17f59 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 16 May 2024 15:56:48 +0100
Subject: [PATCH 1/5] pass proc to trainer(s)

---
 src/transformers/trainer.py         | 11 ++++++++++-
 src/transformers/trainer_seq2seq.py |  3 +++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index afe414fbc8b3..fe00eee062cd 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -75,6 +75,7 @@
     is_torch_greater_or_equal_than_2_3,
 )
 from .tokenization_utils_base import PreTrainedTokenizerBase
+from .processing_utils import ProcessorMixin
 from .trainer_callback import (
     CallbackHandler,
     DefaultFlowCallback,
@@ -318,6 +319,10 @@ class Trainer:
             The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the
             maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
             interrupted training or reuse the fine-tuned model.
+        processor ([`ProcessorMixin`], *optional*):
+            The processor used to pre- and post-process the data for multimodal models. If provided, will be used to
+            automatically pad the inputs to the maximum length when batching inputs, and it will be saved along the
+            model to make it easier to rerun an interrupted training or reuse the fine-tuned model.
         model_init (`Callable[[], PreTrainedModel]`, *optional*):
             A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
             from a new instance of the model as given by this function.
@@ -375,6 +380,7 @@ def __init__(
         train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None,
         eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        processor: Optional[ProcessorMixin] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
         compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
         callbacks: Optional[List[TrainerCallback]] = None,
@@ -510,6 +516,10 @@ def __init__(
         ):
             self.place_model_on_device = False
 
+        self.tokenizer = tokenizer
+        if processor is not None and hasattr(processor, "feature_extractor"):
+            tokenizer = processor.feature_extractor
+
         default_collator = (
             DataCollatorWithPadding(tokenizer)
             if tokenizer is not None and isinstance(tokenizer, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
@@ -518,7 +528,6 @@ def __init__(
         self.data_collator = data_collator if data_collator is not None else default_collator
         self.train_dataset = train_dataset
         self.eval_dataset = eval_dataset
-        self.tokenizer = tokenizer
 
         # Bnb Quantized models doesn't support `.to` operation.
         if (
diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py
index b6bce1b57d5e..6452c4ca5510 100644
--- a/src/transformers/trainer_seq2seq.py
+++ b/src/transformers/trainer_seq2seq.py
@@ -34,6 +34,7 @@
     from .trainer_callback import TrainerCallback
     from .trainer_utils import EvalPrediction, PredictionOutput
     from .training_args import TrainingArguments
+    from .processing_utils import ProcessorMixin
 
 
 logger = logging.get_logger(__name__)
@@ -48,6 +49,7 @@ def __init__(
         train_dataset: Optional[Dataset] = None,
         eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
         tokenizer: Optional["PreTrainedTokenizerBase"] = None,
+        processor: Optional["ProcessorMixin"] = None,
         model_init: Optional[Callable[[], "PreTrainedModel"]] = None,
         compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None,
         callbacks: Optional[List["TrainerCallback"]] = None,
@@ -61,6 +63,7 @@ def __init__(
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
             tokenizer=tokenizer,
+            processor=processor,
             model_init=model_init,
             compute_metrics=compute_metrics,
             callbacks=callbacks,

From cde0339ac6ac0d86e07174dc779ae73fbb430b81 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 16 May 2024 15:57:03 +0100
Subject: [PATCH 2/5] update examples

---
 .../speech-recognition/run_speech_recognition_seq2seq.py  | 2 +-
 .../robust-speech-event/run_speech_recognition_ctc_bnb.py | 2 +-
 examples/research_projects/xtreme-s/run_xtreme_s.py       | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
index 943dff1894ed..b688786fef43 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -584,7 +584,7 @@ def compute_metrics(pred):
         args=training_args,
         train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
         eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-        tokenizer=feature_extractor,
+        processor=processor,
         data_collator=data_collator,
         compute_metrics=compute_metrics if training_args.predict_with_generate else None,
     )
diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
index ebf33eb01df5..f5b36356f2e4 100755
--- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
+++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
@@ -709,7 +709,7 @@ def compute_metrics(pred):
        compute_metrics=compute_metrics,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-       tokenizer=feature_extractor,
+       processor=processor,
        optimizers=optimizers,
     )
diff --git a/examples/research_projects/xtreme-s/run_xtreme_s.py b/examples/research_projects/xtreme-s/run_xtreme_s.py
index e01ccbf4488d..5c65b7ba0633 100644
--- a/examples/research_projects/xtreme-s/run_xtreme_s.py
+++ b/examples/research_projects/xtreme-s/run_xtreme_s.py
@@ -844,7 +844,7 @@ def compute_classification_metric(pred):
             compute_metrics=compute_asr_metric if training_args.predict_with_generate else None,
             train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
             eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-            tokenizer=feature_extractor,
+            processor=processor,
         )
     else:
         trainer = Trainer(
@@ -855,7 +855,7 @@ def compute_classification_metric(pred):
             compute_metrics=compute_asr_metric if is_text_target else compute_classification_metric,
             train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
             eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-            tokenizer=feature_extractor,
+            processor=processor,
         )
 
     # 8. Finally, we can start training

From 53ec49df10bc8a9e47ebac7c87fc0f1d752e69f9 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 16 May 2024 15:59:50 +0100
Subject: [PATCH 3/5] make style

---
 src/transformers/trainer.py         | 2 +-
 src/transformers/trainer_seq2seq.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index fe00eee062cd..154d206ab182 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -69,13 +69,13 @@
     MODEL_MAPPING_NAMES,
 )
 from .optimization import Adafactor, get_scheduler
+from .processing_utils import ProcessorMixin
 from .pytorch_utils import (
     ALL_LAYERNORM_LAYERS,
     is_torch_greater_or_equal_than_1_13,
     is_torch_greater_or_equal_than_2_3,
 )
 from .tokenization_utils_base import PreTrainedTokenizerBase
-from .processing_utils import ProcessorMixin
 from .trainer_callback import (
     CallbackHandler,
     DefaultFlowCallback,
diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py
index 6452c4ca5510..b0b935953287 100644
--- a/src/transformers/trainer_seq2seq.py
+++ b/src/transformers/trainer_seq2seq.py
@@ -30,11 +30,11 @@
 if TYPE_CHECKING:
     from .data.data_collator import DataCollator
     from .modeling_utils import PreTrainedModel
+    from .processing_utils import ProcessorMixin
     from .tokenization_utils_base import PreTrainedTokenizerBase
     from .trainer_callback import TrainerCallback
     from .trainer_utils import EvalPrediction, PredictionOutput
     from .training_args import TrainingArguments
-    from .processing_utils import ProcessorMixin
 
 
 logger = logging.get_logger(__name__)

From e5ee5091fd4adb070520862251e66e79069bec24 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 16 May 2024 16:07:19 +0100
Subject: [PATCH 4/5] prefer processor

---
 src/transformers/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 154d206ab182..fafd1168bfcc 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -516,7 +516,7 @@ def __init__(
         ):
             self.place_model_on_device = False
 
-        self.tokenizer = tokenizer
+        self.tokenizer = processor if processor is not None else tokenizer
         if processor is not None and hasattr(processor, "feature_extractor"):
             tokenizer = processor.feature_extractor
 

From f8dc983e889d65402d1e59e38ef4683336cafcb6 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 16 May 2024 17:05:24 +0100
Subject: [PATCH 5/5] guard

---
 src/transformers/trainer.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index fafd1168bfcc..59c5a9885abb 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -516,9 +516,18 @@ def __init__(
         ):
             self.place_model_on_device = False
 
-        self.tokenizer = processor if processor is not None else tokenizer
-        if processor is not None and hasattr(processor, "feature_extractor"):
-            tokenizer = processor.feature_extractor
+        if processor is not None and tokenizer is not None:
+            raise ValueError(
+ "You cannot pass both `processor` and `tokenizer` to the Trainer. Only pass the `processor` if defined." + ) + elif processor is not None: + self.tokenizer = processor + if hasattr(processor, "feature_extractor"): + tokenizer = processor.feature_extractor + elif hasattr(processor, "tokenizer"): + tokenizer = processor.tokenizer + else: + self.tokenizer = tokenizer default_collator = ( DataCollatorWithPadding(tokenizer)