[Trainer] Rename tokenizer to processor, add deprecation #30102
Changes from all commits: bea74b4, cd6d68a, 6ca7eb5, 65be4ac, e8abcfc, b91869b, b166f58, a4ccda9, 2548635
```diff
@@ -58,6 +58,7 @@
 from .configuration_utils import PretrainedConfig
 from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
 from .debug_utils import DebugOption, DebugUnderflowOverflow
+from .feature_extraction_sequence_utils import SequenceFeatureExtractor
 from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend
 from .image_processing_utils import BaseImageProcessor
 from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available
```
```diff
@@ -69,6 +70,7 @@
     MODEL_MAPPING_NAMES,
 )
 from .optimization import Adafactor, get_scheduler
+from .processing_utils import ProcessorMixin
 from .pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
 from .tokenization_utils_base import PreTrainedTokenizerBase
 from .trainer_callback import (
```
```diff
@@ -285,8 +287,9 @@ class Trainer:
             `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided.
         data_collator (`DataCollator`, *optional*):
             The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will
-            default to [`default_data_collator`] if no `tokenizer` is provided, an instance of
-            [`DataCollatorWithPadding`] otherwise.
+            default to [`default_data_collator`] if no `processor` is provided.
+
+            If the `processor` passed is a tokenizer, will default to an instance of [`DataCollatorWithPadding`].
         train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*):
             The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the
             `model.forward()` method are automatically removed.
```
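To make the documented collator default concrete, here is a minimal usage sketch. It assumes the `processor` keyword proposed in this diff is available (it is this PR's proposal, not necessarily the released `Trainer` signature); the checkpoint name and output directory are placeholders.

```python
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# A tiny tokenized dataset just to keep the example self-contained.
train_dataset = Dataset.from_dict({"text": ["short", "a slightly longer example"], "label": [0, 1]})
train_dataset = train_dataset.map(lambda ex: tokenizer(ex["text"]), remove_columns=["text"])

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="tmp_trainer"),
    train_dataset=train_dataset,
    processor=tokenizer,  # a tokenizer -> DataCollatorWithPadding becomes the default collator
)
assert isinstance(trainer.data_collator, DataCollatorWithPadding)
```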
```diff
@@ -300,10 +303,13 @@
             The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the
             `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
             dataset prepending the dictionary key to the metric name.
-        tokenizer ([`PreTrainedTokenizerBase`], *optional*):
-            The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the
-            maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
-            interrupted training or reuse the fine-tuned model.
+        processor ([`PreTrainedTokenizer` or `BaseImageProcessor` or `SequenceFeatureExtractor` or `ProcessorMixin`], *optional*):
+            The processor used to preprocess the data. Can be a tokenizer, image processor, feature extractor or multimodal processor.
+
+            If a tokenizer is provided, it will be used to automatically pad the inputs to the
+            maximum length when batching inputs.
+
+            The processor will be saved along the model to make it easier to rerun an interrupted training or reuse the fine-tuned model.
         model_init (`Callable[[], PreTrainedModel]`, *optional*):
            A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
            from a new instance of the model as given by this function.
```
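Under that broader signature, a non-tokenizer preprocessing object would be passed the same way. A sketch under the same assumption that this PR's `processor` argument exists; the checkpoint name is only illustrative:

```python
from transformers import AutoImageProcessor, AutoModelForImageClassification, Trainer, TrainingArguments

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224")

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="tmp_trainer"),
    processor=image_processor,  # not a tokenizer -> plain default_data_collator is used
)
# Per this diff, the image processor would also be written out with checkpoints and by push_to_hub().
```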
```diff
@@ -360,14 +366,24 @@ def __init__(
         data_collator: Optional[DataCollator] = None,
         train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
         eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
-        tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        processor: Optional[
+            Union[PreTrainedTokenizerBase, BaseImageProcessor, SequenceFeatureExtractor, ProcessorMixin]
+        ] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
         compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
         callbacks: Optional[List[TrainerCallback]] = None,
         optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
-        image_processor: Optional["BaseImageProcessor"] = None,
+        tokenizer: Optional[PreTrainedTokenizerBase] = None,
     ):
+        if tokenizer is not None:
+            warnings.warn(
+                "The `tokenizer` argument is deprecated and will be removed in v5 of Transformers. You can use `processor` "
+                "instead to pass your tokenizer/image processor/feature extractor/multimodal processor object.",
+                FutureWarning,
+            )
+            processor = tokenizer
+
         if args is None:
             output_dir = "tmp_trainer"
             logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.")
```

Contributor: I believe this PR is proposing deprecating …

Contributor: Ah, yes. Perhaps I wasn't clear. What I meant was we would introduce a new argument, which should be used in preference to …

Contributor: Great, thanks for the clarification - on the same page here! Happy to update PR #30864 accordingly, unless you want to see this one to completion @NielsRogge?
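For clarity, this is how the deprecation path above is intended to behave. A self-contained sketch, again assuming this PR's signature; the checkpoint name and output directory are placeholders:

```python
import warnings

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    trainer = Trainer(model=model, args=TrainingArguments(output_dir="tmp_trainer"), tokenizer=tokenizer)

# The old keyword still works, but a FutureWarning is emitted and the value is
# forwarded to the new attribute.
assert any(issubclass(w.category, FutureWarning) for w in caught)
assert trainer.processor is tokenizer
```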
```diff
@@ -490,12 +506,15 @@
         ):
             self.place_model_on_device = False

-        default_collator = DataCollatorWithPadding(tokenizer) if tokenizer is not None else default_data_collator
+        default_collator = (
+            DataCollatorWithPadding(processor)
+            if processor is not None and isinstance(processor, PreTrainedTokenizerBase)
+            else default_data_collator
+        )
         self.data_collator = data_collator if data_collator is not None else default_collator
         self.train_dataset = train_dataset
         self.eval_dataset = eval_dataset
-        self.tokenizer = tokenizer
-        self.image_processor = image_processor
+        self.processor = processor

         # Bnb Quantized models doesn't support `.to` operation.
         if (
```
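The `isinstance` gate above is the key behavioral detail: only objects that can pad get the padding collator. A small standalone sketch of that selection logic (`pick_default_collator` is a hypothetical helper for illustration, not part of the library):

```python
from transformers import DataCollatorWithPadding, PreTrainedTokenizerBase, default_data_collator


def pick_default_collator(processor):
    # Mirrors the branch in the diff: genuine tokenizers can pad, so they get
    # DataCollatorWithPadding; image processors, feature extractors and
    # multimodal processors fall back to default_data_collator.
    if processor is not None and isinstance(processor, PreTrainedTokenizerBase):
        return DataCollatorWithPadding(processor)
    return default_data_collator
```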
```diff
@@ -547,7 +566,7 @@
         default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
         callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
         self.callback_handler = CallbackHandler(
-            callbacks, self.model, self.tokenizer, self.image_processor, self.optimizer, self.lr_scheduler
+            callbacks, self.model, self.processor, self.optimizer, self.lr_scheduler
         )
         self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
```
```diff
@@ -668,6 +687,14 @@
             num_devices = xr.global_runtime_device_count()
             xs.set_global_mesh(xs.Mesh(np.array(range(num_devices)), (num_devices, 1), axis_names=("fsdp", "tensor")))

+    @property
+    def tokenizer(self):
+        warnings.warn(
+            "The 'tokenizer' attribute is deprecated and will be removed in v5 of Transformers. Use `processor` instead",
+            FutureWarning,
+        )
+        return self.processor
+
     def _activate_neftune(self, model):
         r"""
         Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper:
```
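Reading the old attribute would keep working through this property; note it defines only a getter, so assigning to `trainer.tokenizer` would raise an `AttributeError` under this diff. Continuing the previous sketch:

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = trainer.tokenizer  # emits a FutureWarning and returns trainer.processor

assert legacy is trainer.processor
assert any(issubclass(w.category, FutureWarning) for w in caught)
```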
```diff
@@ -821,7 +848,7 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
                 )
             else:
                 lengths = None
-            model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None
+            model_input_name = self.processor.model_input_names[0] if self.processor is not None else None
             return LengthGroupedSampler(
                 self.args.train_batch_size * self.args.gradient_accumulation_steps,
                 dataset=self.train_dataset,
```
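For context on why `model_input_names[0]` is used here: `group_by_length` needs to know which feature carries the sequences whose lengths should be grouped, and tokenizers, image processors and feature extractors all expose that name. For example (checkpoint names are only illustrative):

```python
from transformers import AutoImageProcessor, AutoTokenizer

print(AutoTokenizer.from_pretrained("bert-base-uncased").model_input_names[0])                  # "input_ids"
print(AutoImageProcessor.from_pretrained("google/vit-base-patch16-224").model_input_names[0])   # "pixel_values"
```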
```diff
@@ -3280,10 +3307,8 @@ def _save_tpu(self, output_dir: Optional[str] = None):
                 save_function=xm.save,
                 safe_serialization=self.args.save_safetensors,
             )
-        if self.tokenizer is not None and self.args.should_save:
-            self.tokenizer.save_pretrained(output_dir)
-        if self.image_processor is not None and self.args.should_save:
-            self.image_processor.save_pretrained(output_dir)
+        if self.processor is not None and self.args.should_save:
+            self.processor.save_pretrained(output_dir)

         # We moved the model from TPU -> CPU for saving the weights.
         # Now we should move it back to subsequent compute still works.
```
```diff
@@ -3319,10 +3344,8 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None):
                 output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
             )

-        if self.tokenizer is not None:
-            self.tokenizer.save_pretrained(output_dir)
-        if self.image_processor is not None:
-            self.image_processor.save_pretrained(output_dir)
+        if self.processor is not None:
+            self.processor.save_pretrained(output_dir)

         # Good practice: save your training arguments together with the trained model
         torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
```
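Because the processor is written next to the model weights, a finished (or interrupted) run can be reloaded straight from the output directory. A sketch continuing the trainer built in the earlier examples, assuming its `processor` was a tokenizer; the paths are placeholders:

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer

trainer.save_model("tmp_trainer/final")  # per the diff above, this also calls processor.save_pretrained()
reloaded_model = AutoModelForSequenceClassification.from_pretrained("tmp_trainer/final")
reloaded_tokenizer = AutoTokenizer.from_pretrained("tmp_trainer/final")
```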
```diff
@@ -4017,12 +4040,9 @@ def _push_from_checkpoint(self, checkpoint_folder):
         for modeling_file in modeling_files:
             if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)):
                 shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file))
-        # Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure.
-        if self.tokenizer is not None:
-            self.tokenizer.save_pretrained(output_dir)
-        # Same for the image processor
-        if self.image_processor is not None:
-            self.image_processor.save_pretrained(output_dir)
+        # Saving the processor is fast and we don't know how many files it may have spawned, so we resave it to be sure.
+        if self.processor is not None:
+            self.processor.save_pretrained(output_dir)
         # Same for the training arguments
         torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
```
```diff
@@ -4076,7 +4096,7 @@ def push_to_hub(
         **kwargs,
     ) -> str:
         """
-        Upload `self.model` and `self.tokenizer` or `self.image_processor` to the 🤗 model hub on the repo `self.args.hub_model_id`.
+        Upload `self.model` and `self.processor` to the 🤗 model hub on the repo `self.args.hub_model_id`.

         Parameters:
             commit_message (`str`, *optional*, defaults to `"End of training"`):
```
Review comment: I think we should be careful when manipulating the init signature, see #30126 - for new args we should put them at the end.