diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index dd77413c82a2..6d8040fe8ec2 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -1662,27 +1662,19 @@ def _save_checkpoint(self, save_dir, tag, client_state={}):
         # then instead just returns None.
         self._curr_ckpt_path = os.path.join(save_dir, tag)

-        state = {
-            'module':
-            self.module_state_dict(),
-            'optimizer':
-            self.optimizer.state_dict()
+        state = dict(
+            module=self.module_state_dict(),
+            optimizer=self.optimizer.state_dict()
             if self.optimizer and not self.zero_optimization() else None,
-            'lr_scheduler':
-            self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None,
-            'csr_tensor_module_names':
-            self.csr_tensor_module_names,
-            'skipped_steps':
-            self.skipped_steps,
-            'global_steps':
-            self.global_steps,
-            'global_samples':
-            self.global_samples,
-            'dp_world_size':
-            self.dp_world_size,
-            'mp_world_size':
-            self.mp_world_size
-        }
+            lr_scheduler=self.lr_scheduler.state_dict()
+            if self.lr_scheduler is not None else None,
+            csr_tensor_module_names=self.csr_tensor_module_names,
+            skipped_steps=self.skipped_steps,
+            global_steps=self.global_steps,
+            global_samples=self.global_samples,
+            dp_world_size=self.dp_world_size,
+            mp_world_size=self.mp_world_size,
+        )
         state.update(client_state)

         log_dist(message=f'Saving model checkpoint: {save_path}', ranks=[0])
diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py
index 622ffa9ba1cb..c179d01f2988 100755
--- a/deepspeed/runtime/zero/config.py
+++ b/deepspeed/runtime/zero/config.py
@@ -36,12 +36,6 @@ def __init__(self, param_dict):
         self.max_reuse_distance = None
         self.gather_fp16_weights_on_model_save = None

-        #Stage3 Specific Parameters
-        self.prefetch_bucket_size = None
-        self.param_persistence_threshold = None
-        self.max_live_parameters = None
-        self.max_reuse_distance = None
-
         if ZERO_OPTIMIZATION in param_dict.keys():
             zero_config_dict = param_dict[ZERO_OPTIMIZATION]
             if type(zero_config_dict) is bool:
diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index c654e66abc59..0acc675985ca 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -399,10 +399,10 @@ def _convert_to_deepspeed_param(self, param):
         # Stores the shape of the original tensor
         param.ds_shape = param.shape

-        # Stores the number of elements in the original parmaeter without padding
+        # Stores the number of elements in the original parameter without padding
         param.ds_numel = param.numel()

-        # Stores the paritioned copy of the tensor
+        # Stores the partitioned copy of the tensor
         param.ds_tensor = None

         # Keeps track of how many active sub-modules need this param at any given point in time
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index ea4653578616..e5299949fcf6 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -580,7 +580,7 @@ def __init__(self,
                  gradient_accumulation_steps=1,
                  elastic_checkpoint=False):

-        see_memory_usage("Stage 3 intialize beginning", force=True)
+        see_memory_usage("Stage 3 initialize beginning", force=True)

         if dist.get_rank() == 0:
             logger.info(f"Reduce bucket size {reduce_bucket_size}")
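For reference, the engine.py hunk above changes only how the checkpoint `state` mapping is built: `dict(...)` keyword arguments replace a quoted-key literal that yapf had split awkwardly across lines. A minimal standalone sketch of the equivalence follows; the state values below are illustrative stand-ins, not the real engine attributes:

```python
# Stand-in values; in _save_checkpoint these come from the engine
# (self.module_state_dict(), self.optimizer.state_dict(), etc.).
module_state = {"weight": [1.0, 2.0]}
optimizer_state = None  # e.g. None when ZeRO manages optimizer state
lr_scheduler_state = {"last_epoch": 3}

# Keyword-argument form used after the refactor.
state = dict(
    module=module_state,
    optimizer=optimizer_state,
    lr_scheduler=lr_scheduler_state,
)

# Equivalent to the quoted-key literal the diff removes.
assert state == {
    "module": module_state,
    "optimizer": optimizer_state,
    "lr_scheduler": lr_scheduler_state,
}
```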