16 changes: 8 additions & 8 deletions deepspeed/runtime/engine.py
@@ -1440,7 +1440,7 @@ def _configure_fp16_optimizer(self, optimizer):
         if isinstance(optimizer, fused_opts) \
                 or self.optimizer_name() in [ONEBIT_ADAM_OPTIMIZER, ZERO_ONE_ADAM_OPTIMIZER]:
             if self.dynamic_loss_scale():
-                log_dist("Creating fp16 optimizer with dynamic loss scale", ranks=[0])
+                log_dist(f'Creating fp16 optimizer with dynamic loss scale', ranks=[0])
                 timers = self.timers if self.wall_clock_breakdown() else None
                 optimizer = FP16_Optimizer(
                     optimizer,
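
Note on the pattern in this hunk: log_dist gates the message on the listed ranks, so the "Creating ... optimizer" lines are printed once on rank 0 rather than on every worker. A minimal usage sketch, assuming log_dist is importable from deepspeed.utils (the import path is an assumption; the message text is illustrative):

# Usage sketch only; the import path below is an assumption based on
# DeepSpeed's public utils, and the message is illustrative.
from deepspeed.utils import log_dist

# ranks=[0] keeps the message on rank 0; other ranks stay silent.
log_dist('Creating fp16 optimizer with dynamic loss scale', ranks=[0])
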
@@ -1456,10 +1456,8 @@ def _configure_fp16_optimizer(self, optimizer):
                 )
             else:
                 log_dist(
-                    "Creating fp16 optimizer with static loss scale: {}".format(
-                        self.loss_scale()),
-                    ranks=[0],
-                )
+                    f'Creating fp16 optimizer with static loss scale: {self.loss_scale()}',
+                    ranks=[0])
                 optimizer = FP16_Optimizer(
                     optimizer,
                     deepspeed=self,
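
The hunk above collapses a multi-line str.format call into a single f-string; the emitted message is unchanged. A small equivalence sketch, where the 65536.0 loss scale is only an illustrative stand-in for whatever self.loss_scale() returns:

# Equivalence sketch for the change above; 65536.0 is an illustrative value.
loss_scale = 65536.0

old_msg = "Creating fp16 optimizer with static loss scale: {}".format(loss_scale)
new_msg = f'Creating fp16 optimizer with static loss scale: {loss_scale}'

# The refactor only changes how the string is built, not its content.
assert old_msg == new_msg
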
@@ -1470,7 +1468,7 @@ def _configure_fp16_optimizer(self, optimizer):
                     has_moe_layers=self.has_moe_layers,
                 )
         else:
-            log_dist("Creating fp16 unfused optimizer with dynamic loss scale",
+            log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale',
                      ranks=[0])
             optimizer = FP16_UnfusedOptimizer(
                 optimizer,
@@ -1507,6 +1505,7 @@ def _configure_bf16_optimizer(self, optimizer):
 
     def _configure_zero_optimizer(self, optimizer):
         zero_stage = self.zero_optimization_stage()
+        model_dtype, grad_accum_dtype = self.get_data_types()
         assert self.communication_data_type in (torch.float16, torch.bfloat16), "ZeRO supports only 'communication_data_type': ['fp16', 'bfp16']"
         timers = self.timers if self.wall_clock_breakdown() else None
 
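
The new get_data_types() call appears to return the model parameter dtype and the gradient-accumulation dtype; model_dtype is then interpolated into the ZeRO log messages below in place of a hard-coded "fp16". A sketch of the resulting message, assuming torch dtypes for the return values (the concrete dtypes and stage number are illustrative, not pinned down by this diff):

import torch

# Illustrative values; what self.get_data_types() actually returns depends on
# the engine configuration.
model_dtype, grad_accum_dtype = torch.float16, torch.float32
zero_stage = 2

# Mirrors the f-string used in the hunks below.
message = f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer'
print(message)  # -> Creating torch.float16 ZeRO stage 2 optimizer
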
@@ -1524,7 +1523,8 @@ def _configure_zero_optimizer(self, optimizer):
             round_robin_gradients = self.zero_round_robin_gradients()
             assert not isinstance(optimizer, DummyOptim), "zero stage {} requires an optimizer".format(zero_stage)
 
-            log_dist('Creating ZeRO stage {} optimizer'.format(zero_stage), ranks=[0])
+            log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer',
+                     ranks=[0])
             # Overlap and contiguous grads are meaningless in stage 1 and are ignored
             if zero_stage == ZeroStageEnum.optimizer_states:
                 overlap_comm = False
@@ -1588,7 +1588,7 @@ def _configure_zero_optimizer(self, optimizer):
                 offload_param_config=self.zero_offload_param(),
                 mpu=self.mpu)
         else:
-            log_dist('Creating fp16 ZeRO stage {} optimizer'.format(zero_stage),
+            log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer',
                      ranks=[0])
             from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3
             optimizer = DeepSpeedZeroOptimizer_Stage3(