From 4d8d4873a18f185e4bdd86241a96deb9c2f17f59 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 16 May 2024 15:56:48 +0100
Subject: [PATCH 1/5] pass proc to trainer(s)

---
 src/transformers/trainer.py         | 11 ++++++++++-
 src/transformers/trainer_seq2seq.py |  3 +++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index afe414fbc8b3..fe00eee062cd 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -75,6 +75,7 @@
     is_torch_greater_or_equal_than_2_3,
 )
 from .tokenization_utils_base import PreTrainedTokenizerBase
+from .processing_utils import ProcessorMixin
 from .trainer_callback import (
     CallbackHandler,
     DefaultFlowCallback,
@@ -318,6 +319,10 @@ class Trainer:
             The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the
             maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
             interrupted training or reuse the fine-tuned model.
+        processor ([`ProcessorMixin`], *optional*):
+            The processor used to pre- and post-process the data for multimodal models. If provided, will be used to
+            automatically pad the inputs to the maximum length when batching inputs, and it will be saved along the
+            model to make it easier to rerun an interrupted training or reuse the fine-tuned model.
         model_init (`Callable[[], PreTrainedModel]`, *optional*):
             A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
             from a new instance of the model as given by this function.
@@ -375,6 +380,7 @@ def __init__(
         train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None,
         eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        processor: Optional[ProcessorMixin] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
         compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
         callbacks: Optional[List[TrainerCallback]] = None,
@@ -510,6 +516,10 @@ def __init__(
         ):
             self.place_model_on_device = False
 
+        self.tokenizer = tokenizer
+        if processor is not None and hasattr(processor, "feature_extractor"):
+            tokenizer = processor.feature_extractor
+
         default_collator = (
             DataCollatorWithPadding(tokenizer)
             if tokenizer is not None and isinstance(tokenizer, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
@@ -518,7 +528,6 @@ def __init__(
         self.data_collator = data_collator if data_collator is not None else default_collator
         self.train_dataset = train_dataset
         self.eval_dataset = eval_dataset
-        self.tokenizer = tokenizer
 
         # Bnb Quantized models doesn't support `.to` operation.
         if (
diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py
index b6bce1b57d5e..6452c4ca5510 100644
--- a/src/transformers/trainer_seq2seq.py
+++ b/src/transformers/trainer_seq2seq.py
@@ -34,6 +34,7 @@
     from .trainer_callback import TrainerCallback
     from .trainer_utils import EvalPrediction, PredictionOutput
     from .training_args import TrainingArguments
+    from .processing_utils import ProcessorMixin
 
 
 logger = logging.get_logger(__name__)
@@ -48,6 +49,7 @@ def __init__(
         train_dataset: Optional[Dataset] = None,
         eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
         tokenizer: Optional["PreTrainedTokenizerBase"] = None,
+        processor: Optional["ProcessorMixin"] = None,
         model_init: Optional[Callable[[], "PreTrainedModel"]] = None,
         compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None,
         callbacks: Optional[List["TrainerCallback"]] = None,
@@ -61,6 +63,7 @@ def __init__(
             train_dataset=train_dataset,
             eval_dataset=eval_dataset,
             tokenizer=tokenizer,
+            processor=processor,
             model_init=model_init,
             compute_metrics=compute_metrics,
             callbacks=callbacks,

From cde0339ac6ac0d86e07174dc779ae73fbb430b81 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 16 May 2024 15:57:03 +0100
Subject: [PATCH 2/5] update examples

---
 .../speech-recognition/run_speech_recognition_seq2seq.py  | 2 +-
 .../robust-speech-event/run_speech_recognition_ctc_bnb.py | 2 +-
 examples/research_projects/xtreme-s/run_xtreme_s.py       | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
index 943dff1894ed..b688786fef43 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -584,7 +584,7 @@ def compute_metrics(pred):
         args=training_args,
         train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
         eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-        tokenizer=feature_extractor,
+        processor=processor,
         data_collator=data_collator,
         compute_metrics=compute_metrics if training_args.predict_with_generate else None,
     )
diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
index ebf33eb01df5..f5b36356f2e4 100755
--- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
+++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
@@ -709,7 +709,7 @@ def compute_metrics(pred):
        compute_metrics=compute_metrics,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-       tokenizer=feature_extractor,
+       processor=processor,
        optimizers=optimizers,
     )
diff --git a/examples/research_projects/xtreme-s/run_xtreme_s.py b/examples/research_projects/xtreme-s/run_xtreme_s.py
index e01ccbf4488d..5c65b7ba0633 100644
--- a/examples/research_projects/xtreme-s/run_xtreme_s.py
+++ b/examples/research_projects/xtreme-s/run_xtreme_s.py
@@ -844,7 +844,7 @@ def compute_classification_metric(pred):
             compute_metrics=compute_asr_metric if training_args.predict_with_generate else None,
             train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
             eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-            tokenizer=feature_extractor,
+            processor=processor,
         )
     else:
         trainer = Trainer(
@@ -855,7 +855,7 @@ def compute_classification_metric(pred):
             compute_metrics=compute_asr_metric if is_text_target else compute_classification_metric,
             train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
             eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-            tokenizer=feature_extractor,
+            processor=processor,
         )
 
     # 8. Finally, we can start training

From 53ec49df10bc8a9e47ebac7c87fc0f1d752e69f9 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 16 May 2024 15:59:50 +0100
Subject: [PATCH 3/5] make style

---
 src/transformers/trainer.py         | 2 +-
 src/transformers/trainer_seq2seq.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index fe00eee062cd..154d206ab182 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -69,13 +69,13 @@
     MODEL_MAPPING_NAMES,
 )
 from .optimization import Adafactor, get_scheduler
+from .processing_utils import ProcessorMixin
 from .pytorch_utils import (
     ALL_LAYERNORM_LAYERS,
     is_torch_greater_or_equal_than_1_13,
     is_torch_greater_or_equal_than_2_3,
 )
 from .tokenization_utils_base import PreTrainedTokenizerBase
-from .processing_utils import ProcessorMixin
 from .trainer_callback import (
     CallbackHandler,
     DefaultFlowCallback,
diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py
index 6452c4ca5510..b0b935953287 100644
--- a/src/transformers/trainer_seq2seq.py
+++ b/src/transformers/trainer_seq2seq.py
@@ -30,11 +30,11 @@
 if TYPE_CHECKING:
     from .data.data_collator import DataCollator
     from .modeling_utils import PreTrainedModel
+    from .processing_utils import ProcessorMixin
     from .tokenization_utils_base import PreTrainedTokenizerBase
     from .trainer_callback import TrainerCallback
     from .trainer_utils import EvalPrediction, PredictionOutput
     from .training_args import TrainingArguments
-    from .processing_utils import ProcessorMixin
 
 
 logger = logging.get_logger(__name__)

From e5ee5091fd4adb070520862251e66e79069bec24 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 16 May 2024 16:07:19 +0100
Subject: [PATCH 4/5] prefer processor

---
 src/transformers/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 154d206ab182..fafd1168bfcc 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -516,7 +516,7 @@ def __init__(
         ):
             self.place_model_on_device = False
 
-        self.tokenizer = tokenizer
+        self.tokenizer = processor if processor is not None else tokenizer
         if processor is not None and hasattr(processor, "feature_extractor"):
             tokenizer = processor.feature_extractor
 

From f8dc983e889d65402d1e59e38ef4683336cafcb6 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 16 May 2024 17:05:24 +0100
Subject: [PATCH 5/5] guard

---
 src/transformers/trainer.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index fafd1168bfcc..59c5a9885abb 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -516,9 +516,18 @@ def __init__(
         ):
             self.place_model_on_device = False
 
-        self.tokenizer = processor if processor is not None else tokenizer
-        if processor is not None and hasattr(processor, "feature_extractor"):
-            tokenizer = processor.feature_extractor
+        if processor is not None and tokenizer is not None:
+            raise ValueError(
+ "You cannot pass both `processor` and `tokenizer` to the Trainer. Only pass the `processor` if defined." + ) + elif processor is not None: + self.tokenizer = processor + if hasattr(processor, "feature_extractor"): + tokenizer = processor.feature_extractor + elif hasattr(processor, "tokenizer"): + tokenizer = processor.tokenizer + else: + self.tokenizer = tokenizer default_collator = ( DataCollatorWithPadding(tokenizer)