diff --git a/README.md b/README.md
index a2a7ce92..60a64970 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,7 @@ model = SetFitModel.from_pretrained(
 args = TrainingArguments(
     batch_size=16,
     num_epochs=4,
-    evaluation_strategy="epoch",
+    eval_strategy="epoch",
     save_strategy="epoch",
     load_best_model_at_end=True,
 )
diff --git a/docs/source/en/how_to/absa.mdx b/docs/source/en/how_to/absa.mdx
index 68992827..0bd38c95 100644
--- a/docs/source/en/how_to/absa.mdx
+++ b/docs/source/en/how_to/absa.mdx
@@ -87,7 +87,7 @@ args = TrainingArguments(
     num_epochs=5,
     use_amp=True,
     batch_size=128,
-    evaluation_strategy="steps",
+    eval_strategy="steps",
     eval_steps=50,
     save_steps=50,
     load_best_model_at_end=True,
diff --git a/docs/source/en/how_to/v1.0.0_migration_guide.mdx b/docs/source/en/how_to/v1.0.0_migration_guide.mdx
index daa8a87f..6f064137 100644
--- a/docs/source/en/how_to/v1.0.0_migration_guide.mdx
+++ b/docs/source/en/how_to/v1.0.0_migration_guide.mdx
@@ -42,7 +42,7 @@ This list contains new functionality that can be used starting from v1.0.0.
 * [`AbsaTrainer`] and [`AbsaModel`] have been introduced for applying [SetFit for Aspect Based Sentiment Analysis](absa).
 * [`Trainer`] now supports a `callbacks` argument for a list of [`transformers` `TrainerCallback` instances](https://huggingface.co/docs/transformers/main/en/main_classes/callback).
   * By default, all installed callbacks integrated with `transformers` are supported, including [`TensorBoardCallback`](https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.integrations.TensorBoardCallback), [`WandbCallback`](https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.integrations.WandbCallback) to log training logs to [TensorBoard](https://www.tensorflow.org/tensorboard) and [W&B](https://wandb.ai), respectively.
-  * The [`Trainer`] will now print `embedding_loss` in the terminal, as well as `eval_embedding_loss` if `evaluation_strategy` is set to `"epoch"` or `"steps"` in [`TrainingArguments`].
+  * The [`Trainer`] will now print `embedding_loss` in the terminal, as well as `eval_embedding_loss` if `eval_strategy` is set to `"epoch"` or `"steps"` in [`TrainingArguments`].
 * [`Trainer.evaluate`] now works with string labels.
 * An updated contrastive pair sampler increases the variety of training pairs.
 * [`TrainingArguments`] supports various new arguments:
@@ -65,14 +65,14 @@ This list contains new functionality that can be used starting from v1.0.0.
   * `logging_first_step`: Whether to log and evaluate the first `global_step` or not.
   * `logging_steps`: Number of update steps between two logs if `logging_strategy="steps"`.

-  * `evaluation_strategy`: The evaluation strategy to adopt during training. Possible values are:
+  * `eval_strategy`: The evaluation strategy to adopt during training. Possible values are:

     - `"no"`: No evaluation is done during training.
     - `"steps"`: Evaluation is done (and logged) every `eval_steps`.
     - `"epoch"`: Evaluation is done at the end of each epoch.

-  * `eval_steps`: Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same as `logging_steps` if not set.
-  * `eval_delay`: Number of epochs or steps to wait for before the first evaluation can be performed, depending on the `evaluation_strategy`.
+  * `eval_steps`: Number of update steps between two evaluations if `eval_strategy="steps"`. Will default to the same as `logging_steps` if not set.
+  * `eval_delay`: Number of epochs or steps to wait for before the first evaluation can be performed, depending on the `eval_strategy`.
   * `eval_max_steps`: If set to a positive number, the total number of evaluation steps to perform. The evaluation may stop before reaching the set number of steps when all data is exhausted.

   * `save_strategy`: The checkpoint save strategy to adopt during training. Possible values are:
@@ -81,12 +81,12 @@ This list contains new functionality that can be used starting from v1.0.0.
     - `"steps"`: Save is done every `save_steps`.

   * `save_steps`: Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
-  * `save_total_limit`: If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. Note, the best model is always preserved if the `evaluation_strategy` is not `"no"`.
+  * `save_total_limit`: If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. Note, the best model is always preserved if the `eval_strategy` is not `"no"`.
   * `load_best_model_at_end`: Whether or not to load the best model found during training at the end of training.

-    When set to `True`, the parameters `save_strategy` needs to be the same as `evaluation_strategy`, and in
+    When set to `True`, the parameter `save_strategy` needs to be the same as `eval_strategy`, and in
     the case it is "steps", `save_steps` must be a round multiple of `eval_steps`.

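Taken together, the renamed arguments compose as follows. A minimal sketch against SetFit v1.0.0+ with this patch applied; the step counts are illustrative, not recommendations:

```python
from setfit import TrainingArguments

# A minimal post-rename evaluation/checkpoint configuration.
args = TrainingArguments(
    eval_strategy="steps",   # formerly `evaluation_strategy`
    eval_steps=50,           # evaluate every 50 optimizer steps
    save_strategy="steps",   # must match `eval_strategy` when loading the best model
    save_steps=100,          # must be a round multiple of `eval_steps`
    load_best_model_at_end=True,
)
```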
diff --git a/scripts/setfit/distillation_baseline.py b/scripts/setfit/distillation_baseline.py
index 75b84b91..cd08a80a 100644
--- a/scripts/setfit/distillation_baseline.py
+++ b/scripts/setfit/distillation_baseline.py
@@ -82,7 +82,7 @@ def standard_model_distillation(self, train_raw_student, x_test, y_test, num_cla
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            num_train_epochs=self.num_epochs,
-           evaluation_strategy="no",
+           eval_strategy="no",
            save_strategy="no",
            load_best_model_at_end=False,
            weight_decay=0.01,
diff --git a/scripts/setfit/run_fewshot.py b/scripts/setfit/run_fewshot.py
index 08f7023e..db69445f 100644
--- a/scripts/setfit/run_fewshot.py
+++ b/scripts/setfit/run_fewshot.py
@@ -59,7 +59,7 @@ def parse_args():
     parser.add_argument("--override_results", default=False, action="store_true")
     parser.add_argument("--keep_body_frozen", default=False, action="store_true")
     parser.add_argument("--add_data_augmentation", default=False)
-    parser.add_argument("--evaluation_strategy", default=False)
+    parser.add_argument("--eval_strategy", default=False)

     args = parser.parse_args()

@@ -149,8 +149,8 @@ def main():
            num_epochs=args.num_epochs,
            num_iterations=args.num_iterations,
        )
-       if not args.evaluation_strategy:
-           trainer.args.evaluation_strategy = "no"
+       if not args.eval_strategy:
+           trainer.args.eval_strategy = "no"
        if args.classifier == "pytorch":
            trainer.freeze()
            trainer.train()
diff --git a/scripts/transformers/run_fewshot.py b/scripts/transformers/run_fewshot.py
index 33a4e5b4..d248a392 100644
--- a/scripts/transformers/run_fewshot.py
+++ b/scripts/transformers/run_fewshot.py
@@ -94,7 +94,7 @@ def compute_metrics(pred):
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
-       evaluation_strategy="epoch",
+       eval_strategy="epoch",
        logging_steps=100,
        save_strategy="no",
        fp16=True,
diff --git a/scripts/transformers/run_fewshot_multilingual.py b/scripts/transformers/run_fewshot_multilingual.py
index 665a2361..e26826ce 100644
--- a/scripts/transformers/run_fewshot_multilingual.py
+++ b/scripts/transformers/run_fewshot_multilingual.py
@@ -119,7 +119,7 @@ def compute_metrics(pred):
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
-       evaluation_strategy="epoch",
+       eval_strategy="epoch",
        logging_steps=100,
        save_strategy="no",
        fp16=True,
diff --git a/scripts/transformers/run_full.py b/scripts/transformers/run_full.py
index ab93bb43..f3ea0109 100644
--- a/scripts/transformers/run_full.py
+++ b/scripts/transformers/run_full.py
@@ -85,7 +85,7 @@ def compute_metrics(pred):
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.001,
-       evaluation_strategy="epoch",
+       eval_strategy="epoch",
        logging_steps=100,
        metric_for_best_model=metric,
        load_best_model_at_end=True,
diff --git a/scripts/transformers/run_full_multilingual.py b/scripts/transformers/run_full_multilingual.py
index e8b058b2..adfab3fe 100644
--- a/scripts/transformers/run_full_multilingual.py
+++ b/scripts/transformers/run_full_multilingual.py
@@ -104,7 +104,7 @@ def compute_metrics(pred):
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
-       evaluation_strategy="epoch",
+       eval_strategy="epoch",
        logging_steps=100,
        metric_for_best_model="eval_loss",
        load_best_model_at_end=True,
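One caveat for the `scripts/transformers/*` files above: they construct `transformers.TrainingArguments`, which only accepts `eval_strategy` from transformers v4.41 on (older releases still expect `evaluation_strategy`). A defensive sketch, not part of this patch, that picks whichever keyword the installed version supports:

```python
import inspect

from transformers import TrainingArguments

# `eval_strategy` landed in transformers v4.41; fall back for older installs.
_params = inspect.signature(TrainingArguments.__init__).parameters
eval_kwarg = "eval_strategy" if "eval_strategy" in _params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="checkpoints",  # placeholder path
    **{eval_kwarg: "epoch"},
)
```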
diff --git a/src/setfit/model_card.py b/src/setfit/model_card.py
index 958c0849..63166401 100644
--- a/src/setfit/model_card.py
+++ b/src/setfit/model_card.py
@@ -80,7 +80,7 @@ def on_train_begin(
            "logging_strategy",
            "logging_first_step",
            "logging_steps",
-           "evaluation_strategy",
+           "eval_strategy",
            "eval_steps",
            "eval_delay",
            "save_strategy",
diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py
index 7698e686..bdf781be 100644
--- a/src/setfit/trainer.py
+++ b/src/setfit/trainer.py
@@ -443,7 +443,7 @@ def train_embeddings(
        train_dataloader, loss_func, batch_size, num_unique_pairs = self.get_dataloader(
            x_train, y_train, args=args, max_pairs=train_max_pairs
        )
-       if x_eval is not None and args.evaluation_strategy != IntervalStrategy.NO:
+       if x_eval is not None and args.eval_strategy != IntervalStrategy.NO:
            eval_max_pairs = -1 if args.eval_max_steps == -1 else args.eval_max_steps * args.embedding_batch_size
            eval_dataloader, _, _, _ = self.get_dataloader(x_eval, y_eval, args=args, max_pairs=eval_max_pairs)
        else:
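The comparison in `train_embeddings` works because `IntervalStrategy` (from `transformers.trainer_utils`) is a string-backed enum, so the value normalized in `__post_init__` compares cleanly against both enum members and plain strings. A quick illustration, independent of this patch:

```python
from transformers.trainer_utils import IntervalStrategy

# IntervalStrategy subclasses str, so members equal their raw string values.
strategy = IntervalStrategy("steps")     # parses the user-provided string
assert strategy == IntervalStrategy.STEPS
assert strategy == "steps"
assert strategy != IntervalStrategy.NO   # the gate used in `train_embeddings`
```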
diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py
index 9669c460..306274bd 100644
--- a/src/setfit/training_args.py
+++ b/src/setfit/training_args.py
@@ -124,7 +124,7 @@ class TrainingArguments:
            Whether to log and evaluate the first `global_step` or not.
        logging_steps (`int`, defaults to 50):
            Number of update steps between two logs if `logging_strategy="steps"`.
-       evaluation_strategy (`str` or [`~transformers.trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
+       eval_strategy (`str` or [`~transformers.trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
            The evaluation strategy to adopt during training. Possible values are:

                - `"no"`: No evaluation is done during training.
@@ -132,11 +132,11 @@ class TrainingArguments:
                - `"epoch"`: Evaluation is done at the end of each epoch.

        eval_steps (`int`, *optional*):
-           Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same
+           Number of update steps between two evaluations if `eval_strategy="steps"`. Will default to the same
            value as `logging_steps` if not set.
        eval_delay (`float`, *optional*):
            Number of epochs or steps to wait for before the first evaluation can be performed, depending on the
-           evaluation_strategy.
+           eval_strategy.
        eval_max_steps (`int`, defaults to `-1`):
            If set to a positive number, the total number of evaluation steps to perform. The evaluation may stop
            before reaching the set number of steps when all data is exhausted.
@@ -151,13 +151,13 @@ class TrainingArguments:
            Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
        save_total_limit (`int`, *optional*, defaults to `1`):
            If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
-           `output_dir`. Note, the best model is always preserved if the `evaluation_strategy` is not `"no"`.
+           `output_dir`. Note, the best model is always preserved if the `eval_strategy` is not `"no"`.
        load_best_model_at_end (`bool`, *optional*, defaults to `False`):
            Whether or not to load the best model found during training at the end of training.

-           When set to `True`, the parameters `save_strategy` needs to be the same as `evaluation_strategy`, and in
+           When set to `True`, the parameter `save_strategy` needs to be the same as `eval_strategy`, and in
            the case it is "steps", `save_steps` must be a round multiple of `eval_steps`.

@@ -208,7 +208,8 @@ class TrainingArguments:
    logging_first_step: bool = True
    logging_steps: int = 50

-   evaluation_strategy: str = "no"
+   eval_strategy: str = "no"
+   evaluation_strategy: Optional[str] = None  # Softly deprecated; use `eval_strategy` instead
    eval_steps: Optional[int] = None
    eval_delay: int = 0
    eval_max_steps: int = -1
@@ -251,30 +252,36 @@ def __post_init__(self) -> None:
            self.logging_dir = default_logdir()

        self.logging_strategy = IntervalStrategy(self.logging_strategy)
-       self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy)
+       if self.evaluation_strategy is not None:
+           logger.warning(
+               "The `evaluation_strategy` argument is deprecated and will be removed in a future version. "
+               "Please use `eval_strategy` instead."
+           )
+           self.eval_strategy = self.evaluation_strategy
+       self.eval_strategy = IntervalStrategy(self.eval_strategy)

-       if self.eval_steps is not None and self.evaluation_strategy == IntervalStrategy.NO:
-           logger.info('Using `evaluation_strategy="steps"` as `eval_steps` is defined.')
-           self.evaluation_strategy = IntervalStrategy.STEPS
+       if self.eval_steps is not None and self.eval_strategy == IntervalStrategy.NO:
+           logger.info('Using `eval_strategy="steps"` as `eval_steps` is defined.')
+           self.eval_strategy = IntervalStrategy.STEPS

        # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero
-       if self.evaluation_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0):
+       if self.eval_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0):
            if self.logging_steps > 0:
                self.eval_steps = self.logging_steps
            else:
                raise ValueError(
-                   f"evaluation strategy {self.evaluation_strategy} requires either non-zero `eval_steps` or"
+                   f"evaluation strategy {self.eval_strategy} requires either non-zero `eval_steps` or"
                    " `logging_steps`"
                )

        # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible.
        if self.load_best_model_at_end:
-           if self.evaluation_strategy != self.save_strategy:
+           if self.eval_strategy != self.save_strategy:
                raise ValueError(
                    "`load_best_model_at_end` requires the save and eval strategy to match, but found\n- Evaluation "
-                   f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}"
+                   f"strategy: {self.eval_strategy}\n- Save strategy: {self.save_strategy}"
                )
-           if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
+           if self.eval_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
                raise ValueError(
                    "`load_best_model_at_end` requires the saving steps to be a round multiple of the evaluation "
                    f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."
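A quick smoke test of the deprecation shim, assuming the `__post_init__` above (the exact warning text is this patch's wording, not a public contract):

```python
from setfit import TrainingArguments
from transformers.trainer_utils import IntervalStrategy

# New spelling: no warning is emitted.
args = TrainingArguments(eval_strategy="epoch")
assert args.eval_strategy == IntervalStrategy.EPOCH

# Old spelling: logs the deprecation warning, then maps onto `eval_strategy`.
legacy = TrainingArguments(evaluation_strategy="epoch")
assert legacy.eval_strategy == IntervalStrategy.EPOCH

# The compatibility rules still apply after the rename: with
# `load_best_model_at_end=True`, save and eval strategies must match.
try:
    TrainingArguments(load_best_model_at_end=True, eval_strategy="steps", save_strategy="epoch")
except ValueError as err:
    print(err)
```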
diff --git a/tests/span/test_model_card.py b/tests/span/test_model_card.py
index 4b636006..60fd830b 100644
--- a/tests/span/test_model_card.py
+++ b/tests/span/test_model_card.py
@@ -25,7 +25,7 @@ def test_model_card(absa_dataset: Dataset, tmp_path: Path) -> None:
        eval_steps=1,
        logging_steps=1,
        max_steps=2,
-       evaluation_strategy="steps",
+       eval_strategy="steps",
    )
    trainer = AbsaTrainer(
        model=model,
diff --git a/tests/test_model_card.py b/tests/test_model_card.py
index 73cdc92d..910f7237 100644
--- a/tests/test_model_card.py
+++ b/tests/test_model_card.py
@@ -35,7 +35,7 @@ def test_model_card(tmp_path: Path) -> None:
        eval_steps=1,
        logging_steps=1,
        max_steps=2,
-       evaluation_strategy="steps",
+       eval_strategy="steps",
    )
    trainer = Trainer(
        model=model,
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 40f37b5d..b61b10d7 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -590,7 +590,7 @@ def test_train_load_best(model: SetFitModel, tmp_path: Path, caplog: LogCaptureF
        output_dir=tmp_path,
        save_steps=5,
        eval_steps=5,
-       evaluation_strategy="steps",
+       eval_strategy="steps",
        load_best_model_at_end=True,
        num_epochs=5,
    )
diff --git a/tests/test_training_args.py b/tests/test_training_args.py
index ee7b4f88..ecce4f42 100644
--- a/tests/test_training_args.py
+++ b/tests/test_training_args.py
@@ -72,21 +72,21 @@ def test_report_to(self):

    def test_eval_steps_without_eval_strat(self):
        args = TrainingArguments(eval_steps=5)
-       self.assertEqual(args.evaluation_strategy, IntervalStrategy.STEPS)
+       self.assertEqual(args.eval_strategy, IntervalStrategy.STEPS)

    def test_eval_strat_steps_without_eval_steps(self):
-       args = TrainingArguments(evaluation_strategy="steps")
+       args = TrainingArguments(eval_strategy="steps")
        self.assertEqual(args.eval_steps, args.logging_steps)
        with self.assertRaises(ValueError):
-           TrainingArguments(evaluation_strategy="steps", logging_steps=0, logging_strategy="no")
+           TrainingArguments(eval_strategy="steps", logging_steps=0, logging_strategy="no")

    def test_load_best_model(self):
        with self.assertRaises(ValueError):
-           TrainingArguments(load_best_model_at_end=True, evaluation_strategy="steps", save_strategy="epoch")
+           TrainingArguments(load_best_model_at_end=True, eval_strategy="steps", save_strategy="epoch")
        with self.assertRaises(ValueError):
            TrainingArguments(
                load_best_model_at_end=True,
-               evaluation_strategy="steps",
+               eval_strategy="steps",
                save_strategy="steps",
                eval_steps=100,
                save_steps=50,
@@ -94,7 +94,7 @@ def test_load_best_model(self):
        # No error: save_steps is a round multiple of eval_steps
        TrainingArguments(
            load_best_model_at_end=True,
-           evaluation_strategy="steps",
+           eval_strategy="steps",
            save_strategy="steps",
            eval_steps=50,
            save_steps=100,