Set an upper limit on CPU threads in distributed training #18677

Merged · 26 commits · Oct 4, 2023

Commits
e1d7c41
set num threads
awaelchli Sep 29, 2023
0977447
update
awaelchli Sep 29, 2023
0b040c4
todo
awaelchli Sep 29, 2023
f431d61
repro script
awaelchli Sep 29, 2023
ed0f68b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 29, 2023
5be3583
update
awaelchli Sep 29, 2023
b40c351
Merge remote-tracking branch 'origin/feature/set-num-threads' into fe…
awaelchli Sep 29, 2023
c4905d9
update repro
awaelchli Oct 2, 2023
5959ece
Merge branch 'master' into feature/set-num-threads
awaelchli Oct 3, 2023
837671e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 3, 2023
9fd9046
Merge remote-tracking branch 'origin/feature/set-num-threads' into fe…
awaelchli Oct 3, 2023
3ceb644
update
awaelchli Oct 3, 2023
aaba3f0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 3, 2023
4ff08bc
if needed
awaelchli Oct 3, 2023
321d243
expected env var set
awaelchli Oct 3, 2023
066c109
Merge branch 'master' into feature/set-num-threads
awaelchli Oct 3, 2023
c3f2b69
test
awaelchli Oct 4, 2023
1d2ad3a
update test
awaelchli Oct 4, 2023
0d486c5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 4, 2023
775c3b0
changelog
awaelchli Oct 4, 2023
6f883a1
Merge branch 'master' into feature/set-num-threads
awaelchli Oct 4, 2023
f05fbad
update tests
awaelchli Oct 4, 2023
05fc41d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 4, 2023
485ef20
Merge branch 'master' into feature/set-num-threads
awaelchli Oct 4, 2023
66d752b
unblock
awaelchli Oct 4, 2023
9b405c7
Merge branch 'master' into feature/set-num-threads
awaelchli Oct 4, 2023
1 change: 1 addition & 0 deletions docs/source-pytorch/conf.py
@@ -477,6 +477,7 @@ def _load_py_module(name: str, location: str) -> ModuleType:
("py:class", "torch.utils.data.DistributedSampler"),
("py:class", "torch_xla.distributed.parallel_loader.MpDeviceLoader"),
("py:func", "torch_xla.distributed.xla_multiprocessing.spawn"),
("py:class", "torch._dynamo.OptimizedModule"),
("py:mod", "tqdm"),
("py:meth", "training_step"),
("py:meth", "transfer_batch_to_device"),
4 changes: 4 additions & 0 deletions src/lightning/fabric/CHANGELOG.md
@@ -168,6 +168,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Enabled launching via `torchrun` in a SLURM environment; the `TorchElasticEnvironment` now gets chosen over the `SLURMEnvironment` if both are detected ([#18618](https://github.com/Lightning-AI/lightning/pull/18618))


- If not set by the user, Lightning will set `OMP_NUM_THREADS` to `num_cpus / num_processes` when launching subprocesses (e.g. when DDP is used) to avoid system overload for CPU-intensive tasks ([#18677](https://github.com/Lightning-AI/lightning/pull/18677))



### Deprecated

- Deprecated the `DDPStrategy.is_distributed` property. This strategy is distributed by definition ([#17381](https://github.com/Lightning-AI/lightning/pull/17381))
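To make the changelog entry above concrete, here is a rough back-of-the-envelope sketch (not Lightning's internal code; `os.cpu_count()` stands in for the internal CPU-count helper) of the oversubscription this default avoids: without a cap, each of the N worker processes lets OpenMP/PyTorch default to one thread per visible core, so a single machine ends up running roughly N times `num_cpus` CPU threads.

```python
import os

# Illustrative arithmetic only; hypothetical values for an 8-process DDP run on one machine.
num_cpus = os.cpu_count() or 1
num_processes = 8

threads_without_cap = num_processes * num_cpus  # every worker defaults to all cores
threads_with_cap = num_processes * max(1, num_cpus // num_processes)  # roughly num_cpus in total

print(f"without cap: ~{threads_without_cap} CPU threads, with cap: ~{threads_with_cap} CPU threads")
```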
3 changes: 2 additions & 1 deletion src/lightning/fabric/cli.py
@@ -24,6 +24,7 @@
from lightning.fabric.plugins.precision.precision import _PRECISION_INPUT_STR, _PRECISION_INPUT_STR_ALIAS
from lightning.fabric.strategies import STRATEGY_REGISTRY
from lightning.fabric.utilities.device_parser import _parse_gpu_ids
from lightning.fabric.utilities.distributed import _suggested_max_num_threads

_log = logging.getLogger(__name__)

@@ -177,7 +178,7 @@ def _torchrun_launch(args: Namespace, script_args: List[str]) -> None:
torchrun_args.extend(script_args)

# set a good default number of threads for OMP to avoid warnings being emitted to the user
os.environ.setdefault("OMP_NUM_THREADS", str(max(1, (os.cpu_count() or 1) // num_processes)))
os.environ.setdefault("OMP_NUM_THREADS", str(_suggested_max_num_threads()))
torchrun.main(torchrun_args)


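Because `setdefault` is used here, a value exported by the user always wins; the suggested cap only fills the gap when `OMP_NUM_THREADS` is absent. A minimal sketch of that behavior (the numbers are made up for illustration):

```python
import os

suggested = 4  # stand-in for what _suggested_max_num_threads() might return on this machine

os.environ.setdefault("OMP_NUM_THREADS", str(suggested))
# Prints "4" when the variable was unset; a value the user exported before
# invoking the CLI (e.g. "16") is left untouched.
print(os.environ["OMP_NUM_THREADS"])
```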
src/lightning/fabric/strategies/launchers/multiprocessing.py
@@ -27,6 +27,7 @@
from lightning.fabric.accelerators.cpu import CPUAccelerator
from lightning.fabric.strategies.launchers.launcher import _Launcher
from lightning.fabric.utilities.apply_func import move_data_to_device
from lightning.fabric.utilities.distributed import _set_num_threads_if_needed
from lightning.fabric.utilities.imports import _IS_INTERACTIVE
from lightning.fabric.utilities.seed import _collect_rng_states, _set_rng_states

@@ -129,10 +130,11 @@ def _wrapping_function(
) -> None:
if global_states:
global_states.restore()

if self._start_method == "spawn" and isinstance(self._strategy.accelerator, CPUAccelerator):
args, kwargs = _disable_module_memory_sharing((args, kwargs))

_set_num_threads_if_needed(num_processes=self._strategy.num_processes)

os.environ["LOCAL_RANK"] = str(process_idx)
results = function(*args, **kwargs)

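One plausible reason the cap is applied inside `_wrapping_function`, i.e. in each child, rather than once in the parent: with the `spawn` start method every worker is a fresh interpreter, so a `torch.set_num_threads()` call made in the parent does not carry over. A standalone sketch of the same pattern (the `os.cpu_count()`-based division is a stand-in for Lightning's helper, not its exact implementation):

```python
import os

import torch
import torch.multiprocessing as mp


def worker(rank: int, world_size: int) -> None:
    # Runs in the spawned child. Cap the intra-op thread pool here unless the
    # user already chose a value via the environment.
    if "OMP_NUM_THREADS" not in os.environ:
        num_threads = max(1, (os.cpu_count() or 1) // world_size)
        torch.set_num_threads(num_threads)
        os.environ["OMP_NUM_THREADS"] = str(num_threads)
    print(f"rank {rank}: using {torch.get_num_threads()} intra-op threads")


if __name__ == "__main__":
    world_size = 4
    mp.spawn(worker, args=(world_size,), nprocs=world_size)
```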
src/lightning/fabric/strategies/launchers/subprocess_script.py
@@ -24,6 +24,7 @@

from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
from lightning.fabric.strategies.launchers.launcher import _Launcher
from lightning.fabric.utilities.distributed import _set_num_threads_if_needed
from lightning.fabric.utilities.rank_zero import rank_prefixed_message

_logger = logging.getLogger(__name__)
@@ -98,6 +99,8 @@ def launch(self, function: Callable, *args: Any, **kwargs: Any) -> Any:
if not self.cluster_environment.creates_processes_externally:
self._call_children_scripts()
_launch_process_observer(self.procs)

_set_num_threads_if_needed(num_processes=self.num_processes)
return function(*args, **kwargs)

def _call_children_scripts(self) -> None:
14 changes: 14 additions & 0 deletions src/lightning/fabric/utilities/distributed.py
@@ -13,6 +13,7 @@
from torch import Tensor
from torch.utils.data import Dataset, DistributedSampler, Sampler

from lightning.fabric.utilities.data import _num_cpus_available
from lightning.fabric.utilities.rank_zero import rank_zero_info
from lightning.fabric.utilities.types import _PATH, ReduceOp

@@ -359,3 +360,16 @@ def __init__(self, sampler: Union[Sampler, Iterable], *args: Any, **kwargs: Any)
def __iter__(self) -> Iterator:
self.dataset.reset()
return (self.dataset[index] for index in super().__iter__())


def _suggested_max_num_threads(num_processes: int = 1) -> int:
if num_processes < 1:
raise ValueError(f"`num_processes` should be >= 1, got {num_processes}.")
return max(1, _num_cpus_available() // num_processes)


def _set_num_threads_if_needed(num_processes: int = 1) -> None:
if "OMP_NUM_THREADS" not in os.environ:
num_threads = _suggested_max_num_threads(num_processes)
torch.set_num_threads(num_threads)
os.environ["OMP_NUM_THREADS"] = str(num_threads)
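A quick illustration of how these two helpers behave; both are underscore-prefixed internals rather than public API, so the snippet below is only a sketch. The expected values match the unit test further down, which mocks a machine with 4 CPUs: the per-process budget never drops below 1, and `_set_num_threads_if_needed` does nothing once `OMP_NUM_THREADS` is already set.

```python
import os

from lightning.fabric.utilities.distributed import (
    _set_num_threads_if_needed,
    _suggested_max_num_threads,
)

# On a 4-CPU machine this prints 4, 2, 1, 1, 1: floor division with a lower bound of 1.
for num_processes in (1, 2, 3, 4, 8):
    print(num_processes, _suggested_max_num_threads(num_processes))

os.environ.pop("OMP_NUM_THREADS", None)
_set_num_threads_if_needed(num_processes=2)  # sets torch threads and exports OMP_NUM_THREADS
_set_num_threads_if_needed(num_processes=8)  # no-op: the variable is already set
print(os.environ["OMP_NUM_THREADS"])
```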
2 changes: 2 additions & 0 deletions src/lightning/pytorch/CHANGELOG.md
@@ -222,6 +222,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Enabled launching via `torchrun` in a SLURM environment; the `TorchElasticEnvironment` now gets chosen over the `SLURMEnvironment` if both are detected ([#18618](https://github.com/Lightning-AI/lightning/pull/18618))


- If not set by the user, Lightning will set `OMP_NUM_THREADS` to `num_cpus / num_processes` when launching subprocesses (e.g. when DDP is used) to avoid system overload for CPU-intensive tasks ([#18677](https://github.com/Lightning-AI/lightning/pull/18677))


### Deprecated

src/lightning/pytorch/strategies/launchers/multiprocessing.py
@@ -33,6 +33,7 @@
_disable_module_memory_sharing,
)
from lightning.fabric.utilities import move_data_to_device
from lightning.fabric.utilities.distributed import _set_num_threads_if_needed
from lightning.fabric.utilities.seed import _collect_rng_states, _set_rng_states
from lightning.fabric.utilities.types import _PATH
from lightning.pytorch.accelerators import CPUAccelerator
@@ -154,6 +155,8 @@ def _wrapping_function(
if self._start_method == "spawn" and isinstance(self._strategy.accelerator, CPUAccelerator):
args, kwargs = _disable_module_memory_sharing((args, kwargs))

_set_num_threads_if_needed(num_processes=self._strategy.num_processes)

os.environ["LOCAL_RANK"] = str(process_idx)
results = function(*args, **kwargs)

src/lightning/pytorch/strategies/launchers/subprocess_script.py
@@ -25,6 +25,7 @@
_hydra_subprocess_cmd,
_launch_process_observer,
)
from lightning.fabric.utilities.distributed import _set_num_threads_if_needed
from lightning.pytorch.strategies.launchers.launcher import _Launcher
from lightning.pytorch.trainer.connectors.signal_connector import _SIGNUM

@@ -96,6 +97,8 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"]
if not self.cluster_environment.creates_processes_externally:
self._call_children_scripts()
_launch_process_observer(self.procs)

_set_num_threads_if_needed(num_processes=self.num_processes)
return function(*args, **kwargs)

def kill(self, signum: _SIGNUM) -> None:
1 change: 1 addition & 0 deletions tests/tests_fabric/conftest.py
@@ -56,6 +56,7 @@ def restore_env_variables():
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
"CUDA_MODULE_LOADING", # leaked since PyTorch 1.13
"CRC32C_SW_MODE", # set by tensorboardX
"OMP_NUM_THREADS", # set by our launchers
# set by XLA FSDP on XRT
"XRT_TORCH_DIST_ROOT",
"XRT_MESH_SERVICE_ADDRESS",
32 changes: 31 additions & 1 deletion tests/tests_fabric/utilities/test_distributed.py
@@ -2,14 +2,21 @@
import os
from functools import partial
from pathlib import Path
from unittest import mock

import pytest
import torch
from lightning.fabric.accelerators import CPUAccelerator, CUDAAccelerator, MPSAccelerator
from lightning.fabric.plugins.environments import LightningEnvironment
from lightning.fabric.strategies import DDPStrategy, SingleDeviceStrategy
from lightning.fabric.strategies.launchers.multiprocessing import _MultiProcessingLauncher
from lightning.fabric.utilities.distributed import _gather_all_tensors, _sync_ddp, is_shared_filesystem
from lightning.fabric.utilities.distributed import (
_gather_all_tensors,
_set_num_threads_if_needed,
_suggested_max_num_threads,
_sync_ddp,
is_shared_filesystem,
)

from tests_fabric.helpers.runif import RunIf

@@ -158,3 +165,26 @@ def _test_is_shared_filesystem(strategy, tmp_path, monkeypatch):

# Remote path is considered shared
assert is_shared_filesystem(strategy, path="s3://my-bucket/data")


@pytest.mark.parametrize("invalid", [-1, 0])
def test_suggested_max_num_threads(invalid):
with pytest.raises(ValueError, match="should be >= 1"):
_suggested_max_num_threads(invalid)


@mock.patch.dict(os.environ, {}, clear=True)
@mock.patch("lightning.fabric.utilities.distributed.torch.set_num_threads")
@mock.patch("lightning.fabric.utilities.distributed._num_cpus_available", return_value=4)
@pytest.mark.parametrize(("num_processes", "expected"), [(1, 4), (2, 2), (3, 1), (4, 1), (8, 1)])
def test_set_num_threads_if_needed(_, set_num_threads_mock, num_processes, expected):
assert "OMP_NUM_THREADS" not in os.environ
_set_num_threads_if_needed(num_processes)
set_num_threads_mock.assert_called_with(expected)
assert os.environ["OMP_NUM_THREADS"] == str(expected)

# if env variable is already set, no change
set_num_threads_mock.reset_mock()
_set_num_threads_if_needed(1)
set_num_threads_mock.assert_not_called()
assert os.environ["OMP_NUM_THREADS"] == str(expected)
1 change: 1 addition & 0 deletions tests/tests_pytorch/conftest.py
@@ -78,6 +78,7 @@ def restore_env_variables():
"KMP_DUPLICATE_LIB_OK", # leaked since PyTorch 1.13
"CRC32C_SW_MODE", # leaked by tensorboardX
"TRITON_CACHE_DIR", # leaked by torch.compile
"OMP_NUM_THREADS", # set by our launchers
# leaked by XLA
"ALLOW_MULTIPLE_LIBTPU_LOAD",
"GRPC_VERBOSITY",
2 changes: 1 addition & 1 deletion tests/tests_pytorch/strategies/test_ddp_integration.py
@@ -337,7 +337,7 @@ def local_rank(self):
def node_rank(self):
return 0

ddp_strategy = DDPStrategy(cluster_environment=MyClusterEnvironment())
ddp_strategy = DDPStrategy(cluster_environment=MyClusterEnvironment(), parallel_devices=[torch.device("cpu")])
assert ddp_strategy.launcher is None
ddp_strategy._configure_launcher()
assert isinstance(ddp_strategy.launcher, _SubprocessScriptLauncher)