diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py
index 1c4610123bfa..2aeb5135350f 100755
--- a/deepspeed/runtime/config.py
+++ b/deepspeed/runtime/config.py
@@ -27,17 +27,18 @@
 TENSOR_CORE_ALIGN_SIZE = 8
 
 ADAM_OPTIMIZER = 'adam'
+ADAMW_OPTIMIZER = 'adamw'
 LAMB_OPTIMIZER = 'lamb'
 ONEBIT_ADAM_OPTIMIZER = 'onebitadam'
 DEEPSPEED_OPTIMIZERS = [
     ADAM_OPTIMIZER,
+    ADAMW_OPTIMIZER,
     LAMB_OPTIMIZER,
     ONEBIT_ADAM_OPTIMIZER,
 ]
 
-# extra optimizer parameters for adam
+# extra optimizer parameters for adam/adamw
 TORCH_ADAM_PARAM = "torch_adam"
-ADAM_W_MODE_PARAM = "adam_w_mode"
 
 
 class DeepSpeedConfigError(Exception):
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 7d3e4bea0cf1..5036c7de0ee6 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -19,8 +19,8 @@
 from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer
 from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
 from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \
-    ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \
-    TORCH_ADAM_PARAM, ADAM_W_MODE_PARAM
+    ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \
+    TORCH_ADAM_PARAM
 from deepspeed.runtime.dataloader import DeepSpeedDataLoader
 
 from deepspeed.runtime.constants import \
@@ -582,10 +582,9 @@ def _configure_basic_optimizer(self, model_parameters):
                 "'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details"
             )
 
-        if self.optimizer_name() == ADAM_OPTIMIZER:
+        if self.optimizer_name() in [ADAM_OPTIMIZER, ADAMW_OPTIMIZER]:
             torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False)
-            adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE_PARAM, True)
-
+            adam_w_mode = self.optimizer_name() == ADAMW_OPTIMIZER
             # zero-offload  torch-adam  adam_w_mode  optimizer
             # T|F           T           T            torch.optim.AdamW
             # T|F           T           F            torch.optim.Adam
@@ -603,7 +602,7 @@ def _configure_basic_optimizer(self, model_parameters):
                                              **optimizer_parameters,
                                              adamw_mode=adam_w_mode)
             else:
-                optimizer_parameters[ADAM_W_MODE_PARAM] = adam_w_mode
+                optimizer_parameters['adam_w_mode'] = adam_w_mode
                 optimizer = FusedAdam(model_parameters, **optimizer_parameters)
 
         elif self.optimizer_name() == LAMB_OPTIMIZER:
diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py
index bdada1b4989a..2173670c632e 100755
--- a/deepspeed/runtime/zero/utils.py
+++ b/deepspeed/runtime/zero/utils.py
@@ -23,7 +23,12 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
     return my_group
 
 
-ZERO_SUPPORTED_OPTIMIZERS = [torch.optim.Adam, FusedAdam, DeepSpeedCPUAdam]
+ZERO_SUPPORTED_OPTIMIZERS = [
+    torch.optim.Adam,
+    torch.optim.AdamW,
+    FusedAdam,
+    DeepSpeedCPUAdam
+]
 
 # Add apex FusedAdam to supported list if apex is installed
 try:
diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py
index eb1ecc86425d..ae6041f3ec44 100755
--- a/tests/unit/test_fp16.py
+++ b/tests/unit/test_fp16.py
@@ -35,9 +35,9 @@ def test_lamb_fp32_grad_clip(tmpdir):
 
     @distributed_test(world_size=[1, 2])
     def _test_lamb_fp32_grad_clip(args, model, hidden_dim):
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             model_parameters=model.parameters())
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -73,9 +73,9 @@ def test_lamb_fp16_basic(tmpdir):
 
     @distributed_test(world_size=[1, 2])
     def _test_lamb_fp16_basic(args, model, hidden_dim):
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             model_parameters=model.parameters())
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -110,9 +110,9 @@ def test_lamb_fp16_empty_grad(tmpdir):
 
     @distributed_test(world_size=[2])
    def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             model_parameters=model.parameters())
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -147,9 +147,9 @@ def test_adam_fp32_empty_grad(tmpdir):
 
     @distributed_test(world_size=[2])
     def _test_adam_fp32_empty_grad(args, model, hidden_dim):
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             model_parameters=model.parameters())
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -179,9 +179,9 @@ def test_adamw_fp16_basic(tmpdir):
     @distributed_test(world_size=[1])
     def _test_adamw_fp16_basic(args, model, hidden_dim):
         optimizer = torch.optim.AdamW(params=model.parameters())
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             optimizer=optimizer)
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              optimizer=optimizer)
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -210,10 +210,10 @@ def test_dict_config_adamw_fp16_basic():
     @distributed_test(world_size=[1])
     def _test_adamw_fp16_basic(args, model, hidden_dim, config_dict):
         optimizer = torch.optim.AdamW(params=model.parameters())
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             optimizer=optimizer,
-                                             config_params=config_dict)
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              optimizer=optimizer,
+                                              config_params=config_dict)
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -245,9 +245,9 @@ def test_adamw_fp16_empty_grad(tmpdir):
     @distributed_test(world_size=[1])
     def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
         optimizer = torch.optim.AdamW(params=model.parameters())
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             optimizer=optimizer)
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              optimizer=optimizer)
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -270,7 +270,7 @@ def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
                           True),
                          ])
 def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
-    #if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
+    # if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
     #    pytest.skip("cpu-adam is not installed")
     config_dict = {
         "train_batch_size": 1,
@@ -311,9 +311,9 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offlo
 
     @distributed_test(world_size=[1])
     def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             model_parameters=model.parameters())
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -338,7 +338,7 @@ def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
                           True),
                          ])
 def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
-    #if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
+    # if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
     #    pytest.skip("cpu-adam is not installed")
     config_dict = {
         "train_batch_size": 4,
@@ -364,9 +364,9 @@ def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
     def _test_zero_static_scale(args):
         hidden_dim = 10
         model = SimpleModel(hidden_dim, empty_grad=True)
-        model, optim, _,_ = deepspeed.initialize(args=args,
-                                                 model=model,
-                                                 model_parameters=model.parameters())
+        model, optim, _, _ = deepspeed.initialize(args=args,
+                                                  model=model,
+                                                  model_parameters=model.parameters())
 
         # Ensure the static scaler is configured.
         assert optim.dynamic_loss_scale == False
@@ -407,9 +407,9 @@ def test_zero_static_scale_deprecated_format(tmpdir):
     def _test_zero_static_scale(args):
         hidden_dim = 10
         model = SimpleModel(hidden_dim, empty_grad=True)
-        model, optim, _,_ = deepspeed.initialize(args=args,
-                                                 model=model,
-                                                 model_parameters=model.parameters())
+        model, optim, _, _ = deepspeed.initialize(args=args,
+                                                  model=model,
+                                                  model_parameters=model.parameters())
 
         # Ensure the static scaler is configured.
         assert optim.dynamic_loss_scale == False
@@ -438,7 +438,7 @@ def _test_zero_static_scale(args):
                           True),
                          ])
 def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
-    #if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
+    # if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
     #    pytest.skip("cpu-adam is not installed")
     config_dict = {
         "train_batch_size": 4,
@@ -460,10 +460,10 @@ def _test_zero_allow_untested_optimizer(args):
         model = SimpleModel(hidden_dim, empty_grad=True)
         optimizer = SimpleOptimizer(model.parameters())
         with pytest.raises(AssertionError):
-            model, optim, _,_ = deepspeed.initialize(args=args,
-                                                     model=model,
-                                                     optimizer=optimizer,
-                                                     model_parameters=model.parameters())
+            model, optim, _, _ = deepspeed.initialize(args=args,
+                                                      model=model,
+                                                      optimizer=optimizer,
+                                                      model_parameters=model.parameters())
 
 
     _test_zero_allow_untested_optimizer(args)
@@ -478,7 +478,7 @@ def _test_zero_allow_untested_optimizer(args):
                           True),
                          ])
 def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
-    #if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
+    # if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
     #    pytest.skip("cpu-adam is not installed")
     config_dict = {
         "train_micro_batch_size_per_gpu": 1,
@@ -536,9 +536,9 @@ def test_adam_amp_basic(tmpdir):
     @distributed_test(world_size=[1])
     def _test_adam_amp_basic(args, model, hidden_dim):
         optimizer = torch.optim.Adam(params=model.parameters())
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             optimizer=optimizer)
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              optimizer=optimizer)
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -574,9 +574,9 @@ def test_lamb_amp_basic(tmpdir):
 
     @distributed_test(world_size=[1, 2])
     def _test_lamb_amp_basic(args, model, hidden_dim):
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             model_parameters=model.parameters())
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -613,9 +613,9 @@ def test_adam_amp_o2(tmpdir):
 
     @distributed_test(world_size=[1, 2])
     def _test_adam_amp_o2(args, model, hidden_dim):
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             model_parameters=model.parameters())
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -652,9 +652,9 @@ def test_adam_amp_o2_empty_grad(tmpdir):
 
     @distributed_test(world_size=[2])
     def _test_adam_amp_o2_empty_grad(args, model, hidden_dim):
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             model_parameters=model.parameters())
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -694,8 +694,8 @@ def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_construct
     def _test_zero_supported_client_optimizer(args, model, optimizer_constructor):
         client_optimizer = optimizer_constructor(params=model.parameters())
         model, _, _, _ = deepspeed.initialize(args=args,
-                                            model=model,
-                                            optimizer=client_optimizer)
+                                              model=model,
+                                              optimizer=client_optimizer)
 
     _test_zero_supported_client_optimizer(args=args,
                                           model=model,
@@ -732,9 +732,9 @@ def test_zero2_reduce_scatter_off(tmpdir):
 
     @distributed_test(world_size=[2])
     def _helper(args, model, hidden_dim):
-        model, _, _,_ = deepspeed.initialize(args=args,
-                                             model=model,
-                                             model_parameters=model.parameters())
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
         data_loader = random_dataloader(model=model,
                                         total_samples=50,
                                         hidden_dim=hidden_dim,
@@ -745,3 +745,53 @@ def _helper(args, model, hidden_dim):
             model.step()
 
     _helper(args=args, model=model, hidden_dim=hidden_dim)
+
+
+@pytest.mark.parametrize('adam_type, torch_impl',
+                         [('Adam',
+                           True),
+                          ('Adam',
+                           False),
+                          ('AdamW',
+                           True),
+                          ('AdamW',
+                           False)])
+def test_fp16_adam_types(tmpdir, adam_type, torch_impl):
+    config_dict = {
+        "train_batch_size": 1,
+        "steps_per_print": 1,
+        "fp16": {
+            "enabled": True,
+            "initial_scale_power": 10
+        },
+        "optimizer": {
+            "type": adam_type,
+            "torch_adam": torch_impl,
+            "params": {
+                "lr": 0.00015
+            }
+        }
+    }
+    args = args_from_dict(tmpdir, config_dict)
+    hidden_dim = 10
+
+    model = SimpleModel(hidden_dim, empty_grad=False)
+
+    @distributed_test(world_size=[1])
+    def _test_fp16_adam_types(args, model, hidden_dim):
+
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
+
+        data_loader = random_dataloader(model=model,
+                                        total_samples=10,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device)
+
+        for _, batch in enumerate(data_loader):
+            loss = model(batch[0], batch[1])
+            model.backward(loss)
+            model.step()
+
+    _test_fp16_adam_types(args=args, model=model, hidden_dim=hidden_dim)
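Usage note (not part of the patch above): after this change, the AdamW variant is selected through the optimizer "type" in the DeepSpeed config rather than the removed "adam_w_mode" parameter, and "torch_adam" still decides between the torch and fused implementations. A minimal sketch follows, assuming this patched DeepSpeed version and a stand-in torch model named "net"; it mirrors the config used in the new test_fp16_adam_types test.

# Sketch only: "net" and this config are illustrative, not part of the diff.
import torch
import deepspeed

net = torch.nn.Linear(10, 10)  # stand-in model for illustration

ds_config = {
    "train_batch_size": 1,
    "fp16": {
        "enabled": True,
        "initial_scale_power": 10
    },
    "optimizer": {
        # "Adam" -> adam_w_mode False, "AdamW" -> adam_w_mode True (decoupled weight decay)
        "type": "AdamW",
        # True -> torch.optim.Adam/AdamW; False -> FusedAdam, or DeepSpeedCPUAdam with ZeRO CPU offload
        "torch_adam": False,
        "params": {
            "lr": 0.00015
        }
    }
}

# Builds the engine with the optimizer chosen by _configure_basic_optimizer above.
model_engine, optimizer, _, _ = deepspeed.initialize(model=net,
                                                     model_parameters=net.parameters(),
                                                     config_params=ds_config)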