diff --git a/deepspeed/utils/distributed.py b/deepspeed/utils/distributed.py
index c9722af21c24..dba48e6fdac4 100644
--- a/deepspeed/utils/distributed.py
+++ b/deepspeed/utils/distributed.py
@@ -12,15 +12,18 @@ def init_distributed(dist_backend="nccl",
                      auto_mpi_discovery=True,
                      distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT,
                      verbose=True):
-    """
-    Initialize torch.distributed backend, potentially performing MPI discovery if needed
+    """Initialize torch.distributed backend, potentially performing MPI discovery if needed.
+
     Arguments:
-        dist_backend (str): torch distributed backend, e.g., nccl, mpi, gloo
-        auto_mpi_discovery (bool): if distributed environment variables are not set, attempt to discover them from MPI
-        distributed_port (int, optional): torch distributed backend port
-        verbose (bool, optional): verbose logging
-    """
+        dist_backend: torch distributed backend, e.g., nccl, mpi, gloo
+
+        auto_mpi_discovery: if distributed environment variables are not set, attempt to discover them from MPI
+
+        distributed_port: torch distributed backend port
+
+        verbose: verbose logging
+    """
 
     required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
     if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)):
         if verbose:
diff --git a/docs/code-docs/source/initialize.rst b/docs/code-docs/source/initialize.rst
index ee10154515ea..938045de8fc8 100644
--- a/docs/code-docs/source/initialize.rst
+++ b/docs/code-docs/source/initialize.rst
@@ -25,7 +25,7 @@ to add DeepSpeed's builtin arguments to your application's parser.
 
 Training Initialization
 -----------------------
-The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``.
+The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``. It will also initialize the distributed backend if it has not been initialized already.
 
 Example usage:
 
@@ -36,3 +36,9 @@ Example usage:
                                                          model_parameters=net.parameters())
 
 .. autofunction:: deepspeed.initialize
+
+Distributed Initialization
+--------------------------
+Optional distributed backend initialization, separate from ``deepspeed.initialize()``. This is useful in scenarios where the user wants to make torch distributed calls before calling ``deepspeed.initialize()``, such as when using model parallelism, pipeline parallelism, or certain data loader setups.
+
+.. autofunction:: deepspeed.init_distributed
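
For reviewers, a minimal sketch of the pattern the new docs describe: calling ``deepspeed.init_distributed()`` up front so torch distributed calls can run before ``deepspeed.initialize()``. The ``argparse`` wiring and the toy ``torch.nn.Linear`` model are illustrative assumptions, not part of this patch; the script assumes it is launched with the deepspeed launcher (or with RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT/LOCAL_RANK set, or under MPI) and with a ``--deepspeed_config`` file supplied.

```python
import argparse

import torch
import deepspeed

# Initialize the distributed backend first so torch.distributed calls
# (e.g. building model-parallel process groups) can happen before
# deepspeed.initialize(). Ranks come from the launcher's environment
# variables, or are discovered via MPI if those are absent.
deepspeed.init_distributed(dist_backend="nccl")

rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()

# Toy model standing in for a real network (illustrative only).
net = torch.nn.Linear(10, 10)

parser = argparse.ArgumentParser()
parser = deepspeed.add_config_arguments(parser)  # adds --deepspeed_config etc.
cmd_args = parser.parse_args()

# deepspeed.initialize() sees the backend is already up and skips
# re-initializing it.
model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args,
                                                     model=net,
                                                     model_parameters=net.parameters())
```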