diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
index 943dff1894ed..b688786fef43 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -584,7 +584,7 @@ def compute_metrics(pred):
         args=training_args,
         train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
         eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-        tokenizer=feature_extractor,
+        processor=processor,
         data_collator=data_collator,
         compute_metrics=compute_metrics if training_args.predict_with_generate else None,
     )
diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
index ebf33eb01df5..f5b36356f2e4 100755
--- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
+++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
@@ -709,7 +709,7 @@ def compute_metrics(pred):
         compute_metrics=compute_metrics,
         train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
         eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-        tokenizer=feature_extractor,
+        processor=processor,
         optimizers=optimizers,
     )

diff --git a/examples/research_projects/xtreme-s/run_xtreme_s.py b/examples/research_projects/xtreme-s/run_xtreme_s.py
index e01ccbf4488d..5c65b7ba0633 100644
--- a/examples/research_projects/xtreme-s/run_xtreme_s.py
+++ b/examples/research_projects/xtreme-s/run_xtreme_s.py
@@ -844,7 +844,7 @@ def compute_classification_metric(pred):
             compute_metrics=compute_asr_metric if training_args.predict_with_generate else None,
             train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
             eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-            tokenizer=feature_extractor,
+            processor=processor,
         )
     else:
         trainer = Trainer(
@@ -855,7 +855,7 @@ def compute_classification_metric(pred):
             compute_metrics=compute_asr_metric if is_text_target else compute_classification_metric,
             train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
             eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-            tokenizer=feature_extractor,
+            processor=processor,
         )

     # 8. Finally, we can start training
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index afe414fbc8b3..59c5a9885abb 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -69,6 +69,7 @@
     MODEL_MAPPING_NAMES,
 )
 from .optimization import Adafactor, get_scheduler
+from .processing_utils import ProcessorMixin
 from .pytorch_utils import (
     ALL_LAYERNORM_LAYERS,
     is_torch_greater_or_equal_than_1_13,
@@ -318,6 +319,10 @@ class Trainer:
             The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to
             the maximum length when batching inputs, and it will be saved along the model to make it easier to rerun
             an interrupted training or reuse the fine-tuned model.
+        processor ([`ProcessorMixin`], *optional*):
+            The processor used to pre- and post-process the data for multimodal models. If provided, will be used to
+            automatically pad the inputs to the maximum length when batching inputs, and it will be saved along the
+            model to make it easier to rerun an interrupted training or reuse the fine-tuned model.
         model_init (`Callable[[], PreTrainedModel]`, *optional*):
             A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
             from a new instance of the model as given by this function.
@@ -375,6 +380,7 @@ def __init__(
         train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None,
         eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        processor: Optional[ProcessorMixin] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
         compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
         callbacks: Optional[List[TrainerCallback]] = None,
@@ -510,6 +516,19 @@ def __init__(
         ):
             self.place_model_on_device = False

+        if processor is not None and tokenizer is not None:
+            raise ValueError(
+                "You cannot pass both `processor` and `tokenizer` to the Trainer. Only pass the `processor` if defined."
+            )
+        elif processor is not None:
+            self.tokenizer = processor
+            if hasattr(processor, "feature_extractor"):
+                tokenizer = processor.feature_extractor
+            elif hasattr(processor, "tokenizer"):
+                tokenizer = processor.tokenizer
+        else:
+            self.tokenizer = tokenizer
+
         default_collator = (
             DataCollatorWithPadding(tokenizer)
             if tokenizer is not None and isinstance(tokenizer, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
@@ -518,7 +537,6 @@ def __init__(
         )
         self.data_collator = data_collator if data_collator is not None else default_collator
         self.train_dataset = train_dataset
         self.eval_dataset = eval_dataset
-        self.tokenizer = tokenizer

         # Bnb Quantized models doesn't support `.to` operation.
         if (
diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py
index b6bce1b57d5e..b0b935953287 100644
--- a/src/transformers/trainer_seq2seq.py
+++ b/src/transformers/trainer_seq2seq.py
@@ -30,6 +30,7 @@
 if TYPE_CHECKING:
     from .data.data_collator import DataCollator
     from .modeling_utils import PreTrainedModel
+    from .processing_utils import ProcessorMixin
     from .tokenization_utils_base import PreTrainedTokenizerBase
     from .trainer_callback import TrainerCallback
     from .trainer_utils import EvalPrediction, PredictionOutput
@@ -48,6 +49,7 @@ def __init__(
         train_dataset: Optional[Dataset] = None,
         eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
         tokenizer: Optional["PreTrainedTokenizerBase"] = None,
+        processor: Optional["ProcessorMixin"] = None,
         model_init: Optional[Callable[[], "PreTrainedModel"]] = None,
         compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None,
         callbacks: Optional[List["TrainerCallback"]] = None,
@@ -61,6 +63,7 @@ def __init__(
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
             tokenizer=tokenizer,
+            processor=processor,
             model_init=model_init,
             compute_metrics=compute_metrics,
             callbacks=callbacks,
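For context, a minimal usage sketch of the new argument (the checkpoint name and output directory below are illustrative placeholders, not part of the patch): after this change an ASR fine-tuning script passes the whole processor to the trainer instead of `tokenizer=feature_extractor`, and the trainer stores it on `self.tokenizer` while using `processor.feature_extractor` (falling back to `processor.tokenizer`) to build the default padding collator.

# Hypothetical minimal usage after this patch; checkpoint name and output_dir are placeholders.
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# A processor bundles the feature extractor and the tokenizer for a multimodal model.
processor = AutoProcessor.from_pretrained("openai/whisper-small")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
training_args = Seq2SeqTrainingArguments(output_dir="whisper-finetuned", predict_with_generate=True)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    processor=processor,  # new: replaces `tokenizer=feature_extractor`; train/eval datasets and a
                          # custom data collator would be passed here as in the example scripts above
)

Because the processor is stored on `self.tokenizer`, it is also saved alongside the model checkpoints, as described in the docstring added above.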