diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 8a9d5abf173f..f5737c07ea04 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -585,6 +585,12 @@ def _configure_distributed_model(self, model):
 
     def _configure_optimizer(self, client_optimizer, model_parameters):
         if client_optimizer is not None:
+            client_optimizer.param_groups[:] = [
+                pg for pg in client_optimizer.param_groups if len(pg["params"]) != 0
+            ]
+            logger.info(
+                "Removing param_group that has no 'params' in the client Optimizer")
+
             basic_optimizer = client_optimizer
             if self.global_rank == 0:
                 logger.info('Using client Optimizer as basic optimizer')
diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py
index 515233851a1d..7846da12fdbd 100755
--- a/deepspeed/runtime/lr_schedules.py
+++ b/deepspeed/runtime/lr_schedules.py
@@ -706,8 +706,8 @@ def __init__(self,
         self.min_lrs = self._format_param(self.optimizer, warmup_min_lr, "min_lr")
         self.max_lrs = self._format_param(self.optimizer, warmup_max_lr, "max_lr")
         self.delta_lrs = [big - small for big, small in zip(self.max_lrs, self.min_lrs)]
-        self.warmup_num_steps = warmup_num_steps
-        self.inverse_log_warm_up = 1.0 / math.log(warmup_num_steps)
+        self.warmup_num_steps = max(2, warmup_num_steps)
+        self.inverse_log_warm_up = 1.0 / math.log(self.warmup_num_steps)
         self.last_batch_iteration = last_batch_iteration
 
     def get_lr(self):
diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md
index e7730ebe2661..46546066ab1a 100644
--- a/docs/_tutorials/pipeline.md
+++ b/docs/_tutorials/pipeline.md
@@ -132,7 +132,7 @@ net = PipelineModule(layers=net.to_layers(), num_stages=2)
 ```
 
 **Note:**
-the `lamda` in the middle of `layers` above is not a `torch.nn.Module`
+the `lambda` in the middle of `layers` above is not a `torch.nn.Module`
 type. Any object that implements `__call__()` can be a layer in a
 `PipelineModule`: this allows for convenient data transformations in the
 pipeline.
@@ -165,7 +165,7 @@ These modifications can be accomplished with a short subclass:
 class TransformerBlockPipe(TransformerBlock)
     def forward(self, inputs):
         hidden, mask = inputs
-        outputs = super().forward(hidden, mask)
+        output = super().forward(hidden, mask)
         return (output, mask)
 stack = [ TransformerBlockPipe() for _ in range(num_layers) ]
 ```
@@ -269,17 +269,18 @@ by DeepSpeed:
 * `partition_method="uniform"` balances the number of layers per stage.
 
 ### Memory-Efficient Model Construction
-Building a `Sequential` and providing it `PipelineModule` is a convenient way
-of specifying a pipeline parallel model. However, this approach encounters
-scalability issues for massive models. Starting from a `Sequential` allocates
-the model in CPU memory redundantly by every worker. A machine with 16 GPUs
-must have as much local CPU memory as 16 times the model size.
+Building a `Sequential` container and providing it to a `PipelineModule` is a convenient way
+of specifying a pipeline parallel model. However, this approach encounters scalability issues
+for massive models because each worker replicates the whole model in CPU memory.
+For example, a machine with 16 GPUs must have as much local CPU memory as 16 times the model size.
 
 DeepSpeed provides a `LayerSpec` class that delays the construction of
-modules until the model layers have been partitioned across workers. Then,
-the modules are built on the GPU that owns the layer.
+modules until the model layers have been partitioned across workers.
+Then each worker will allocate only the layers it's assigned to. So, continuing the
+example from the previous paragraph, a machine with 16 GPUs will need to allocate a
+total of 1x model size on its CPU, compared to 16x with the `Sequential` approach.
 
-Here's an example of the abbreviated AlexNet model, but expressed only
+Here is an example of the abbreviated AlexNet model, but expressed only
 with `LayerSpec`s. Note that the syntax is almost unchanged:
 `nn.ReLU(inplace=True)` simply becomes `LayerSpec(nn.ReLU, inplace=True)`.
 ```python
diff --git a/version.txt b/version.txt
index 208059121dde..0b9c0199636e 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.3.11
+0.3.12
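
The `engine.py` hunk drops client `param_groups` that carry no parameters before DeepSpeed wraps the optimizer; empty groups can appear, for example, when parameters are split into decay/no-decay groups and one split ends up empty, which downstream optimizer wrappers may not expect. A minimal standalone sketch of the same filtering, using a toy model and optimizer that are purely illustrative:

```python
import torch

# Toy setup (illustrative only): the second group ends up with no parameters,
# e.g. after splitting weights into decay / no-decay groups.
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(
    [
        {"params": list(model.parameters()), "lr": 0.1},
        {"params": [], "lr": 0.1},
    ],
    lr=0.1,
)

# The same in-place filter as the hunk: keep only groups that actually hold params.
optimizer.param_groups[:] = [
    pg for pg in optimizer.param_groups if len(pg["params"]) != 0
]

print(len(optimizer.param_groups))  # 1
```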
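The `lr_schedules.py` change clamps `warmup_num_steps` to at least 2 because the scheduler divides by `math.log(warmup_num_steps)`: `log(1)` is 0 (division by zero) and `log(0)` is a math domain error. A simplified sketch of the log-shaped warm-up factor, assuming it scales `log(step + 1)` by `inverse_log_warm_up` as the surrounding code suggests:

```python
import math

def warmup_factor(step, warmup_num_steps):
    """Log-shaped warm-up factor in [0, 1]; a simplified sketch of the
    inverse_log_warm_up * log(step + 1) term used during warm-up."""
    warmup_num_steps = max(2, warmup_num_steps)  # the fix: log(1) == 0 would divide by zero
    inverse_log_warm_up = 1.0 / math.log(warmup_num_steps)
    return min(1.0, inverse_log_warm_up * math.log(step + 1))

# Without the clamp, warmup_num_steps == 1 gives 1.0 / math.log(1) -> ZeroDivisionError,
# and warmup_num_steps == 0 raises a math domain error.
print([round(warmup_factor(s, 1), 3) for s in range(3)])   # [0.0, 1.0, 1.0]
print([round(warmup_factor(s, 10), 3) for s in range(3)])  # [0.0, 0.301, 0.477]
```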
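The tutorial's full `LayerSpec` version of AlexNet is cut off by the hunk above; the pattern it describes looks roughly like the sketch below. The toy layers are illustrative (not the tutorial's model), and the snippet assumes it is launched across at least two processes (e.g. with the `deepspeed` launcher) so the two pipeline stages can be assigned.

```python
import torch.nn as nn
from deepspeed.pipe import LayerSpec, PipelineModule

# Each LayerSpec records a class and its constructor arguments; the module is
# only instantiated on the pipeline stage that owns that layer, so no worker
# ever materializes the whole model in CPU memory.
specs = [
    LayerSpec(nn.Linear, 512, 512),
    LayerSpec(nn.ReLU, inplace=True),
    LayerSpec(nn.Linear, 512, 10),
]
net = PipelineModule(layers=specs, num_stages=2, loss_fn=nn.CrossEntropyLoss())
```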