diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
index 4c3bc64f03f1..a6fa393a0ae7 100644
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -643,7 +643,6 @@ enables FP16, uses AdamW optimizer and WarmupLR scheduler:
                 "weight_decay": 3e-7
             }
         },
-        "zero_allow_untested_optimizer": true,
 
         "scheduler": {
             "type": "WarmupLR",
@@ -754,8 +753,8 @@ Optimizer
 =======================================================================================================================
 
-DeepSpeed's main optimizers are Adam, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are thus
-recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
+DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are
+thus recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
 <https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__.
 
 If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will
@@ -767,7 +766,6 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
 .. code-block:: json
 
     {
-        "zero_allow_untested_optimizer": true,
         "optimizer": {
             "type": "AdamW",
             "params": {
@@ -779,8 +777,8 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
         }
     }
 
-Since AdamW isn't on the list of tested with DeepSpeed/ZeRO optimizers, we have to add
-``zero_allow_untested_optimizer`` flag.
+If you want to use another optimizer which is not listed above, you will have to add ``"zero_allow_untested_optimizer":
+true`` to the top-level configuration.
 
 If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and
 make sure to adjust the values. e.g. if use Adam you will want ``weight_decay`` around ``0.01``.
diff --git a/examples/tests/deepspeed/ds_config.json b/examples/tests/deepspeed/ds_config.json
index 24034d1f1d59..8c961be5518f 100644
--- a/examples/tests/deepspeed/ds_config.json
+++ b/examples/tests/deepspeed/ds_config.json
@@ -19,8 +19,6 @@
         "cpu_offload": true
     },
 
-    "zero_allow_untested_optimizer": true,
-
     "optimizer": {
         "type": "AdamW",
         "params": {
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index b427e33e7c72..634cea5ff083 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -26,6 +26,7 @@
 from .trainer_utils import SchedulerType
 from .utils import logging
+from .utils.versions import require_version
 
 
 logger = logging.get_logger(__name__)
 
@@ -281,6 +282,8 @@ def init_deepspeed(trainer, num_training_steps):
     """
     import deepspeed
 
+    require_version("deepspeed>0.3.10")
+
     args = trainer.args
     ds_config_file = args.deepspeed
     model = trainer.model
@@ -323,9 +326,8 @@ def init_deepspeed(trainer, num_training_steps):
             f"Keeping the `optimizer` config from {ds_config_file} intact, ignoring any optimizer-specific cl args"
         )
     else:  # override only if the ds config doesn't already have this section
-        # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
-        # But trainer uses AdamW by default.
-        # To use other optimizers so using a different scheduler requires voiding warranty with: `zero_allow_untested_optimizer`
+        # ds supports Adam, AdamW, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
+        # Using other optimizers requires voiding warranty with: `"zero_allow_untested_optimizer": true`
 
         optimizer_configs = {
             "AdamW": {
@@ -337,7 +339,6 @@
         }
         optimizer = "AdamW"
 
-        config["zero_allow_untested_optimizer"] = True
         config["optimizer"] = {
             "type": optimizer,
             "params": optimizer_configs[optimizer],
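
With ``zero_allow_untested_optimizer`` no longer injected by the :class:`~transformers.Trainer`, opting into an
optimizer outside DeepSpeed's tested set under ZeRO is done explicitly in the user's DeepSpeed config. A minimal
sketch of such a config, assuming a ``torch``-importable optimizer name (``Adagrad`` here) and placeholder
hyperparameter values:

.. code-block:: json

    {
        "zero_allow_untested_optimizer": true,

        "optimizer": {
            "type": "Adagrad",
            "params": {
                "lr": 3e-5,
                "weight_decay": 3e-7
            }
        }
    }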