diff --git a/deepspeed/utils/distributed.py b/deepspeed/utils/distributed.py
index c9722af21c24..dba48e6fdac4 100644
--- a/deepspeed/utils/distributed.py
+++ b/deepspeed/utils/distributed.py
@@ -12,15 +12,18 @@ def init_distributed(dist_backend="nccl",
                      auto_mpi_discovery=True,
                      distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT,
                      verbose=True):
-    """
-    Initialize torch.distributed backend, potentially performing MPI discovery if needed
+    """Initialize torch.distributed backend, potentially performing MPI discovery if needed.
+
     Arguments:
-        dist_backend (str): torch distributed backend, e.g., nccl, mpi, gloo
-        auto_mpi_discovery (bool): if distributed environment variables are not set, attempt to discover them from MPI
-        distributed_port (int, optional): torch distributed backend port
-        verbose (bool, optional): verbose logging
-    """
+        dist_backend: torch distributed backend, e.g., nccl, mpi, gloo
+
+        auto_mpi_discovery: if distributed environment variables are not set, attempt to discover them from MPI
+
+        distributed_port: torch distributed backend port
+
+        verbose: verbose logging
+    """
 
     required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
     if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)):
         if verbose:
diff --git a/docs/code-docs/source/initialize.rst b/docs/code-docs/source/initialize.rst
index ee10154515ea..938045de8fc8 100644
--- a/docs/code-docs/source/initialize.rst
+++ b/docs/code-docs/source/initialize.rst
@@ -25,7 +25,7 @@ to add DeepSpeed's builtin arguments to your application's parser.
 
 Training Initialization
 -----------------------
-The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``.
+The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``. It will also initialize the distributed backend if it has not been initialized already.
 
 Example usage:
 
@@ -36,3 +36,9 @@ Example usage:
                                                          model_parameters=net.parameters())
 
 .. autofunction:: deepspeed.initialize
+
+Distributed Initialization
+--------------------------
+Optional distributed backend initialization, separate from ``deepspeed.initialize()``. This is useful in scenarios where the user wants to make torch distributed calls before calling ``deepspeed.initialize()``, such as when using model parallelism, pipeline parallelism, or certain data loader setups.
+
+.. autofunction:: deepspeed.init_distributed
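
For reviewers, a minimal sketch of the pattern the new docs describe: calling ``deepspeed.init_distributed()`` up front so torch distributed calls can run before ``deepspeed.initialize()``. The ``argparse`` wiring and the toy ``torch.nn.Linear`` model are illustrative assumptions, not part of this patch; the script assumes it is launched with the deepspeed launcher (or with RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT/LOCAL_RANK set, or under MPI) and with a ``--deepspeed_config`` file supplied.

```python
import argparse

import torch
import deepspeed

# Initialize the distributed backend first so torch.distributed calls
# (e.g. building model-parallel process groups) can happen before
# deepspeed.initialize(). Ranks come from the launcher's environment
# variables, or are discovered via MPI if those are absent.
deepspeed.init_distributed(dist_backend="nccl")

rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()

# Toy model standing in for a real network (illustrative only).
net = torch.nn.Linear(10, 10)

parser = argparse.ArgumentParser()
parser = deepspeed.add_config_arguments(parser)  # adds --deepspeed_config etc.
cmd_args = parser.parse_args()

# deepspeed.initialize() sees the backend is already up and skips
# re-initializing it.
model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args,
                                                     model=net,
                                                     model_parameters=net.parameters())
```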