diff --git a/README.md b/README.md
index a2a7ce92..60a64970 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,7 @@ model = SetFitModel.from_pretrained(
args = TrainingArguments(
batch_size=16,
num_epochs=4,
- evaluation_strategy="epoch",
+ eval_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
)
diff --git a/docs/source/en/how_to/absa.mdx b/docs/source/en/how_to/absa.mdx
index 68992827..0bd38c95 100644
--- a/docs/source/en/how_to/absa.mdx
+++ b/docs/source/en/how_to/absa.mdx
@@ -87,7 +87,7 @@ args = TrainingArguments(
num_epochs=5,
use_amp=True,
batch_size=128,
- evaluation_strategy="steps",
+ eval_strategy="steps",
eval_steps=50,
save_steps=50,
load_best_model_at_end=True,
diff --git a/docs/source/en/how_to/v1.0.0_migration_guide.mdx b/docs/source/en/how_to/v1.0.0_migration_guide.mdx
index daa8a87f..6f064137 100644
--- a/docs/source/en/how_to/v1.0.0_migration_guide.mdx
+++ b/docs/source/en/how_to/v1.0.0_migration_guide.mdx
@@ -42,7 +42,7 @@ This list contains new functionality that can be used starting from v1.0.0.
* [`AbsaTrainer`] and [`AbsaModel`] have been introduced for applying [SetFit for Aspect Based Sentiment Analysis](absa).
* [`Trainer`] now supports a `callbacks` argument for a list of [`transformers` `TrainerCallback` instances](https://huggingface.co/docs/transformers/main/en/main_classes/callback).
* By default, all installed callbacks integrated with `transformers` are supported, including [`TensorBoardCallback`](https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.integrations.TensorBoardCallback), [`WandbCallback`](https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.integrations.WandbCallback) to log training logs to [TensorBoard](https://www.tensorflow.org/tensorboard) and [W&B](https://wandb.ai), respectively.
- * The [`Trainer`] will now print `embedding_loss` in the terminal, as well as `eval_embedding_loss` if `evaluation_strategy` is set to `"epoch"` or `"steps"` in [`TrainingArguments`].
+ * The [`Trainer`] will now print `embedding_loss` in the terminal, as well as `eval_embedding_loss` if `eval_strategy` is set to `"epoch"` or `"steps"` in [`TrainingArguments`].
* [`Trainer.evaluate`] now works with string labels.
* An updated contrastive pair sampler increases the variety of training pairs.
* [`TrainingArguments`] supports various new arguments:
@@ -65,14 +65,14 @@ This list contains new functionality that can be used starting from v1.0.0.
* `logging_first_step`: Whether to log and evaluate the first `global_step` or not.
* `logging_steps`: Number of update steps between two logs if `logging_strategy="steps"`.
- * `evaluation_strategy`: The evaluation strategy to adopt during training. Possible values are:
+ * `eval_strategy`: The evaluation strategy to adopt during training. Possible values are:
- `"no"`: No evaluation is done during training.
- `"steps"`: Evaluation is done (and logged) every `eval_steps`.
- `"epoch"`: Evaluation is done at the end of each epoch.
- * `eval_steps`: Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same as `logging_steps` if not set.
- * `eval_delay`: Number of epochs or steps to wait for before the first evaluation can be performed, depending on the `evaluation_strategy`.
+ * `eval_steps`: Number of update steps between two evaluations if `eval_strategy="steps"`. Will default to the same as `logging_steps` if not set.
+ * `eval_delay`: Number of epochs or steps to wait for before the first evaluation can be performed, depending on the `eval_strategy`.
* `eval_max_steps`: If set to a positive number, the total number of evaluation steps to perform. The evaluation may stop before reaching the set number of steps when all data is exhausted.
* `save_strategy`: The checkpoint save strategy to adopt during training. Possible values are:
@@ -81,12 +81,12 @@ This list contains new functionality that can be used starting from v1.0.0.
- `"steps"`: Save is done every `save_steps`.
* `save_steps`: Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
- * `save_total_limit`: If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. Note, the best model is always preserved if the `evaluation_strategy` is not `"no"`.
+ * `save_total_limit`: If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. Note, the best model is always preserved if the `eval_strategy` is not `"no"`.
* `load_best_model_at_end`: Whether or not to load the best model found during training at the end of training.
- When set to `True`, the parameters `save_strategy` needs to be the same as `evaluation_strategy`, and in
+ When set to `True`, the parameters `save_strategy` needs to be the same as `eval_strategy`, and in
the case it is "steps", `save_steps` must be a round multiple of `eval_steps`.
diff --git a/scripts/setfit/distillation_baseline.py b/scripts/setfit/distillation_baseline.py
index 75b84b91..cd08a80a 100644
--- a/scripts/setfit/distillation_baseline.py
+++ b/scripts/setfit/distillation_baseline.py
@@ -82,7 +82,7 @@ def standard_model_distillation(self, train_raw_student, x_test, y_test, num_cla
per_device_train_batch_size=self.batch_size,
per_device_eval_batch_size=self.batch_size,
num_train_epochs=self.num_epochs,
- evaluation_strategy="no",
+ eval_strategy="no",
save_strategy="no",
load_best_model_at_end=False,
weight_decay=0.01,
diff --git a/scripts/setfit/run_fewshot.py b/scripts/setfit/run_fewshot.py
index 08f7023e..db69445f 100644
--- a/scripts/setfit/run_fewshot.py
+++ b/scripts/setfit/run_fewshot.py
@@ -59,7 +59,7 @@ def parse_args():
parser.add_argument("--override_results", default=False, action="store_true")
parser.add_argument("--keep_body_frozen", default=False, action="store_true")
parser.add_argument("--add_data_augmentation", default=False)
- parser.add_argument("--evaluation_strategy", default=False)
+ parser.add_argument("--eval_strategy", default=False)
args = parser.parse_args()
@@ -149,8 +149,8 @@ def main():
num_epochs=args.num_epochs,
num_iterations=args.num_iterations,
)
- if not args.evaluation_strategy:
- trainer.args.evaluation_strategy = "no"
+ if not args.eval_strategy:
+ trainer.args.eval_strategy = "no"
if args.classifier == "pytorch":
trainer.freeze()
trainer.train()
diff --git a/scripts/transformers/run_fewshot.py b/scripts/transformers/run_fewshot.py
index 33a4e5b4..d248a392 100644
--- a/scripts/transformers/run_fewshot.py
+++ b/scripts/transformers/run_fewshot.py
@@ -94,7 +94,7 @@ def compute_metrics(pred):
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=0.01,
- evaluation_strategy="epoch",
+ eval_strategy="epoch",
logging_steps=100,
save_strategy="no",
fp16=True,
diff --git a/scripts/transformers/run_fewshot_multilingual.py b/scripts/transformers/run_fewshot_multilingual.py
index 665a2361..e26826ce 100644
--- a/scripts/transformers/run_fewshot_multilingual.py
+++ b/scripts/transformers/run_fewshot_multilingual.py
@@ -119,7 +119,7 @@ def compute_metrics(pred):
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=0.01,
- evaluation_strategy="epoch",
+ eval_strategy="epoch",
logging_steps=100,
save_strategy="no",
fp16=True,
diff --git a/scripts/transformers/run_full.py b/scripts/transformers/run_full.py
index ab93bb43..f3ea0109 100644
--- a/scripts/transformers/run_full.py
+++ b/scripts/transformers/run_full.py
@@ -85,7 +85,7 @@ def compute_metrics(pred):
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=0.001,
- evaluation_strategy="epoch",
+ eval_strategy="epoch",
logging_steps=100,
metric_for_best_model=metric,
load_best_model_at_end=True,
diff --git a/scripts/transformers/run_full_multilingual.py b/scripts/transformers/run_full_multilingual.py
index e8b058b2..adfab3fe 100644
--- a/scripts/transformers/run_full_multilingual.py
+++ b/scripts/transformers/run_full_multilingual.py
@@ -104,7 +104,7 @@ def compute_metrics(pred):
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=0.01,
- evaluation_strategy="epoch",
+ eval_strategy="epoch",
logging_steps=100,
metric_for_best_model="eval_loss",
load_best_model_at_end=True,
diff --git a/src/setfit/model_card.py b/src/setfit/model_card.py
index 958c0849..63166401 100644
--- a/src/setfit/model_card.py
+++ b/src/setfit/model_card.py
@@ -80,7 +80,7 @@ def on_train_begin(
"logging_strategy",
"logging_first_step",
"logging_steps",
- "evaluation_strategy",
+ "eval_strategy",
"eval_steps",
"eval_delay",
"save_strategy",
diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py
index 7698e686..bdf781be 100644
--- a/src/setfit/trainer.py
+++ b/src/setfit/trainer.py
@@ -443,7 +443,7 @@ def train_embeddings(
train_dataloader, loss_func, batch_size, num_unique_pairs = self.get_dataloader(
x_train, y_train, args=args, max_pairs=train_max_pairs
)
- if x_eval is not None and args.evaluation_strategy != IntervalStrategy.NO:
+ if x_eval is not None and args.eval_strategy != IntervalStrategy.NO:
eval_max_pairs = -1 if args.eval_max_steps == -1 else args.eval_max_steps * args.embedding_batch_size
eval_dataloader, _, _, _ = self.get_dataloader(x_eval, y_eval, args=args, max_pairs=eval_max_pairs)
else:
diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py
index 9669c460..306274bd 100644
--- a/src/setfit/training_args.py
+++ b/src/setfit/training_args.py
@@ -124,7 +124,7 @@ class TrainingArguments:
Whether to log and evaluate the first `global_step` or not.
logging_steps (`int`, defaults to 50):
Number of update steps between two logs if `logging_strategy="steps"`.
- evaluation_strategy (`str` or [`~transformers.trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
+ eval_strategy (`str` or [`~transformers.trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
The evaluation strategy to adopt during training. Possible values are:
- `"no"`: No evaluation is done during training.
@@ -132,11 +132,11 @@ class TrainingArguments:
- `"epoch"`: Evaluation is done at the end of each epoch.
eval_steps (`int`, *optional*):
- Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same
+ Number of update steps between two evaluations if `eval_strategy="steps"`. Will default to the same
value as `logging_steps` if not set.
eval_delay (`float`, *optional*):
Number of epochs or steps to wait for before the first evaluation can be performed, depending on the
- evaluation_strategy.
+ eval_strategy.
eval_max_steps (`int`, defaults to `-1`):
If set to a positive number, the total number of evaluation steps to perform. The evaluation may stop
before reaching the set number of steps when all data is exhausted.
@@ -151,13 +151,13 @@ class TrainingArguments:
Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
save_total_limit (`int`, *optional*, defaults to `1`):
If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
- `output_dir`. Note, the best model is always preserved if the `evaluation_strategy` is not `"no"`.
+ `output_dir`. Note, the best model is always preserved if the `eval_strategy` is not `"no"`.
load_best_model_at_end (`bool`, *optional*, defaults to `False`):
Whether or not to load the best model found during training at the end of training.
- When set to `True`, the parameters `save_strategy` needs to be the same as `evaluation_strategy`, and in
+ When set to `True`, the parameters `save_strategy` needs to be the same as `eval_strategy`, and in
the case it is "steps", `save_steps` must be a round multiple of `eval_steps`.
@@ -208,7 +208,8 @@ class TrainingArguments:
logging_first_step: bool = True
logging_steps: int = 50
- evaluation_strategy: str = "no"
+ eval_strategy: str = "no"
+ evaluation_strategy: Optional[str] = field(default=None, repr=False)  # Softly deprecated
eval_steps: Optional[int] = None
eval_delay: int = 0
eval_max_steps: int = -1
@@ -251,30 +252,36 @@ def __post_init__(self) -> None:
self.logging_dir = default_logdir()
self.logging_strategy = IntervalStrategy(self.logging_strategy)
- self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy)
+ if self.evaluation_strategy is not None:
+ logger.warning(
+ "The `evaluation_strategy` argument is deprecated and will be removed in a future version. "
+ "Please use `eval_strategy` instead."
+ )
+ self.eval_strategy = self.evaluation_strategy
+ self.eval_strategy = IntervalStrategy(self.eval_strategy)
- if self.eval_steps is not None and self.evaluation_strategy == IntervalStrategy.NO:
- logger.info('Using `evaluation_strategy="steps"` as `eval_steps` is defined.')
- self.evaluation_strategy = IntervalStrategy.STEPS
+ if self.eval_steps is not None and self.eval_strategy == IntervalStrategy.NO:
+ logger.info('Using `eval_strategy="steps"` as `eval_steps` is defined.')
+ self.eval_strategy = IntervalStrategy.STEPS
# eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero
- if self.evaluation_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0):
+ if self.eval_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0):
if self.logging_steps > 0:
self.eval_steps = self.logging_steps
else:
raise ValueError(
- f"evaluation strategy {self.evaluation_strategy} requires either non-zero `eval_steps` or"
+ f"evaluation strategy {self.eval_strategy} requires either non-zero `eval_steps` or"
" `logging_steps`"
)
# Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible.
if self.load_best_model_at_end:
- if self.evaluation_strategy != self.save_strategy:
+ if self.eval_strategy != self.save_strategy:
raise ValueError(
"`load_best_model_at_end` requires the save and eval strategy to match, but found\n- Evaluation "
- f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}"
+ f"strategy: {self.eval_strategy}\n- Save strategy: {self.save_strategy}"
)
- if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
+ if self.eval_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
raise ValueError(
"`load_best_model_at_end` requires the saving steps to be a round multiple of the evaluation "
f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."
diff --git a/tests/span/test_model_card.py b/tests/span/test_model_card.py
index 4b636006..60fd830b 100644
--- a/tests/span/test_model_card.py
+++ b/tests/span/test_model_card.py
@@ -25,7 +25,7 @@ def test_model_card(absa_dataset: Dataset, tmp_path: Path) -> None:
eval_steps=1,
logging_steps=1,
max_steps=2,
- evaluation_strategy="steps",
+ eval_strategy="steps",
)
trainer = AbsaTrainer(
model=model,
diff --git a/tests/test_model_card.py b/tests/test_model_card.py
index 73cdc92d..910f7237 100644
--- a/tests/test_model_card.py
+++ b/tests/test_model_card.py
@@ -35,7 +35,7 @@ def test_model_card(tmp_path: Path) -> None:
eval_steps=1,
logging_steps=1,
max_steps=2,
- evaluation_strategy="steps",
+ eval_strategy="steps",
)
trainer = Trainer(
model=model,
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 40f37b5d..b61b10d7 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -590,7 +590,7 @@ def test_train_load_best(model: SetFitModel, tmp_path: Path, caplog: LogCaptureF
output_dir=tmp_path,
save_steps=5,
eval_steps=5,
- evaluation_strategy="steps",
+ eval_strategy="steps",
load_best_model_at_end=True,
num_epochs=5,
)
diff --git a/tests/test_training_args.py b/tests/test_training_args.py
index ee7b4f88..ecce4f42 100644
--- a/tests/test_training_args.py
+++ b/tests/test_training_args.py
@@ -72,21 +72,21 @@ def test_report_to(self):
def test_eval_steps_without_eval_strat(self):
args = TrainingArguments(eval_steps=5)
- self.assertEqual(args.evaluation_strategy, IntervalStrategy.STEPS)
+ self.assertEqual(args.eval_strategy, IntervalStrategy.STEPS)
def test_eval_strat_steps_without_eval_steps(self):
- args = TrainingArguments(evaluation_strategy="steps")
+ args = TrainingArguments(eval_strategy="steps")
self.assertEqual(args.eval_steps, args.logging_steps)
with self.assertRaises(ValueError):
- TrainingArguments(evaluation_strategy="steps", logging_steps=0, logging_strategy="no")
+ TrainingArguments(eval_strategy="steps", logging_steps=0, logging_strategy="no")
def test_load_best_model(self):
with self.assertRaises(ValueError):
- TrainingArguments(load_best_model_at_end=True, evaluation_strategy="steps", save_strategy="epoch")
+ TrainingArguments(load_best_model_at_end=True, eval_strategy="steps", save_strategy="epoch")
with self.assertRaises(ValueError):
TrainingArguments(
load_best_model_at_end=True,
- evaluation_strategy="steps",
+ eval_strategy="steps",
save_strategy="steps",
eval_steps=100,
save_steps=50,
@@ -94,7 +94,7 @@ def test_load_best_model(self):
# No error: save_steps is a round multiple of eval_steps
TrainingArguments(
load_best_model_at_end=True,
- evaluation_strategy="steps",
+ eval_strategy="steps",
save_strategy="steps",
eval_steps=50,
save_steps=100,