diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 5870e9680b47..49a154b80b55 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -516,7 +516,14 @@ def is_moe_group(self, group):
         return 'moe' in group and group['moe']

     def _configure_moe_settings(self):
-        assert self.contiguous_gradients, "Contiguous Gradients in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE"
+        # if we're using ZeRO stage 2, ensure contiguous gradients are used
+        if self.partition_gradients:
+            assert self.contiguous_gradients, "Contiguous Gradients in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE"
+        # NOTE: To run ZeRO stage 1 with MoE, we need to set self.contiguous_gradients to True or ignore the assertion
+        if not self.partition_gradients and not self.contiguous_gradients:
+            logger.warn(
+                "ZeRO Stage 1 has not been thoroughly tested with MoE. This configuration is still experimental."
+            )
         assert self.reduce_scatter, "Reduce Scatter in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE"
         assert any([self.is_moe_group(group) for group in self.optimizer.param_groups]), "The model has moe layers, but None of the param groups are marked as MoE. Create a param group with 'moe' key set to True before creating optimizer"
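
For context, below is a minimal sketch of how a setup could exercise the new stage 1 path: the optimizer param groups mark the expert parameters with the `'moe'` key (as required by the `assert any(...)` check above), and the ZeRO config uses stage 1 with `contiguous_gradients` disabled, which now only triggers the warning instead of the stage 2 assertion. The toy model, group layout, and config values here are illustrative assumptions, not part of this patch.

```python
import torch
import torch.nn as nn

# Hypothetical toy model: one dense layer plus two "expert" layers standing in
# for a real MoE block.
model = nn.ModuleDict({
    "dense": nn.Linear(16, 16),
    "experts": nn.ModuleList([nn.Linear(16, 16) for _ in range(2)]),
})

# Put expert parameters in their own param group and mark it with 'moe': True
# so _configure_moe_settings() finds at least one MoE-marked group.
param_groups = [
    {"params": list(model["dense"].parameters())},
    {"params": list(model["experts"].parameters()), "moe": True},
]
optimizer = torch.optim.AdamW(param_groups, lr=1e-3)

# ZeRO stage 1 config (illustrative): partition_gradients is off at stage 1,
# and contiguous_gradients=False now logs the experimental warning rather
# than failing the stage 2 assertion.
ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "zero_optimization": {
        "stage": 1,
        "contiguous_gradients": False,
        "reduce_scatter": True,
    },
}
```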