@@ -584,7 +584,7 @@ def compute_metrics(pred):
         args=training_args,
         train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
         eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-        tokenizer=feature_extractor,
+        processor=processor,
         data_collator=data_collator,
         compute_metrics=compute_metrics if training_args.predict_with_generate else None,
     )
@@ -709,7 +709,7 @@ def compute_metrics(pred):
         compute_metrics=compute_metrics,
         train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
         eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-        tokenizer=feature_extractor,
+        processor=processor,
         optimizers=optimizers,
     )

4 changes: 2 additions & 2 deletions examples/research_projects/xtreme-s/run_xtreme_s.py
@@ -844,7 +844,7 @@ def compute_classification_metric(pred):
             compute_metrics=compute_asr_metric if training_args.predict_with_generate else None,
             train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
             eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-            tokenizer=feature_extractor,
+            processor=processor,
         )
     else:
         trainer = Trainer(
@@ -855,7 +855,7 @@ def compute_classification_metric(pred):
             compute_metrics=compute_asr_metric if is_text_target else compute_classification_metric,
             train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
             eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-            tokenizer=feature_extractor,
+            processor=processor,
         )
 
     # 8. Finally, we can start training
20 changes: 19 additions & 1 deletion src/transformers/trainer.py
@@ -69,6 +69,7 @@
     MODEL_MAPPING_NAMES,
 )
 from .optimization import Adafactor, get_scheduler
+from .processing_utils import ProcessorMixin
 from .pytorch_utils import (
     ALL_LAYERNORM_LAYERS,
     is_torch_greater_or_equal_than_1_13,
@@ -318,6 +319,10 @@ class Trainer:
             The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the
             maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
             interrupted training or reuse the fine-tuned model.
+        processor ([`ProcessorMixin`], *optional*):
+            The processor used to pre- and post-process the data for multimodal models. If provided, will be used to
+            automatically pad the inputs to the maximum length when batching inputs, and it will be saved along the
+            model to make it easier to rerun an interrupted training or reuse the fine-tuned model.
         model_init (`Callable[[], PreTrainedModel]`, *optional*):
             A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
             from a new instance of the model as given by this function.
@@ -375,6 +380,7 @@ def __init__(
         train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None,
         eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        processor: Optional[ProcessorMixin] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
         compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
         callbacks: Optional[List[TrainerCallback]] = None,
@@ -510,6 +516,19 @@ def __init__(
         ):
             self.place_model_on_device = False
 
+        if processor is not None and tokenizer is not None:
+            raise ValueError(
+                "You cannot pass both `processor` and `tokenizer` to the Trainer. Only pass the `processor` if defined."
+            )
+        elif processor is not None:
+            self.tokenizer = processor
+            if hasattr(processor, "feature_extractor"):
+                tokenizer = processor.feature_extractor
+            elif hasattr(processor, "tokenizer"):
+                tokenizer = processor.tokenizer
+        else:
+            self.tokenizer = tokenizer
+
         default_collator = (
             DataCollatorWithPadding(tokenizer)
             if tokenizer is not None and isinstance(tokenizer, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
@@ -518,7 +537,6 @@ def __init__(
         self.data_collator = data_collator if data_collator is not None else default_collator
         self.train_dataset = train_dataset
         self.eval_dataset = eval_dataset
-        self.tokenizer = tokenizer
 
         # Bnb Quantized models doesn't support `.to` operation.
         if (
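Net effect of the trainer.py change: Trainer now accepts a multimodal processor directly, stores it as self.tokenizer for padding and saving purposes, and builds the default data collator from processor.feature_extractor (or processor.tokenizer as a fallback); passing both processor and tokenizer raises a ValueError. A minimal usage sketch under this diff — the checkpoint, model class, and output directory are illustrative assumptions, not part of the change:

from transformers import AutoModelForCTC, AutoProcessor, Trainer, TrainingArguments

# Illustrative speech setup: the processor bundles a feature extractor and a tokenizer.
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="tmp_trainer"),
    processor=processor,  # new in this diff; replaces tokenizer=feature_extractor at call sites
)

# Per the logic above, the default data collator is fed processor.feature_extractor here,
# and Trainer(..., tokenizer=..., processor=...) would raise a ValueError.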
3 changes: 3 additions & 0 deletions src/transformers/trainer_seq2seq.py
@@ -30,6 +30,7 @@
 if TYPE_CHECKING:
     from .data.data_collator import DataCollator
     from .modeling_utils import PreTrainedModel
+    from .processing_utils import ProcessorMixin
     from .tokenization_utils_base import PreTrainedTokenizerBase
     from .trainer_callback import TrainerCallback
     from .trainer_utils import EvalPrediction, PredictionOutput
@@ -48,6 +49,7 @@ def __init__(
         train_dataset: Optional[Dataset] = None,
         eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
         tokenizer: Optional["PreTrainedTokenizerBase"] = None,
+        processor: Optional["ProcessorMixin"] = None,
         model_init: Optional[Callable[[], "PreTrainedModel"]] = None,
         compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None,
         callbacks: Optional[List["TrainerCallback"]] = None,
@@ -61,6 +63,7 @@ def __init__(
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
             tokenizer=tokenizer,
+            processor=processor,
             model_init=model_init,
             compute_metrics=compute_metrics,
             callbacks=callbacks,
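Seq2SeqTrainer only threads the new argument through to Trainer, so the example scripts above can hand over their processor unchanged. A hedged sketch mirroring the speech seq2seq examples — the Whisper checkpoint and generation settings are assumptions for illustration, not taken from this diff:

from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny")
processor = AutoProcessor.from_pretrained("openai/whisper-tiny")  # feature extractor + tokenizer

trainer = Seq2SeqTrainer(
    model=model,
    args=Seq2SeqTrainingArguments(output_dir="tmp_seq2seq", predict_with_generate=True),
    processor=processor,  # forwarded to Trainer; no separate tokenizer= argument needed
)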