Fix ddp accelerator choice for cpu (#8645)
* Fix ddp accelerator choice for cpu
kaushikb11 authored Aug 2, 2021
1 parent dd8216a commit d01d833
Showing 3 changed files with 12 additions and 4 deletions.
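In short, this commit makes a CPU-only Trainer(accelerator="ddp", num_processes=2) resolve to the CPUAccelerator with a plain DDPPlugin instead of being routed through the spawn-based CPU plugin. A minimal sketch of the intended behavior, mirroring the test added below (import paths are assumed from the PyTorch Lightning 1.4-era package layout):

    from pytorch_lightning import Trainer
    from pytorch_lightning.accelerators import CPUAccelerator
    from pytorch_lightning.plugins import DDPPlugin

    # On a machine without GPUs, requesting "ddp" with several processes
    # should now select CPU + plain DDP rather than the spawn-based CPU plugin.
    trainer = Trainer(accelerator="ddp", num_processes=2)
    assert isinstance(trainer.accelerator, CPUAccelerator)
    assert isinstance(trainer.training_type_plugin, DDPPlugin)
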
8 changes: 5 additions & 3 deletions pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -607,7 +607,7 @@ def select_training_type_plugin(self) -> TrainingTypePlugin:
         use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.is_using_torchelastic()
         use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.is_using_kubeflow()
         use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN
-        use_ddp_cpu_spawn = self.use_ddp and self.use_cpu
+        use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu
         use_tpu_spawn = self.use_tpu and self._distrib_type == DistributedType.TPU_SPAWN
         use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.is_using_torchelastic()
         use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.is_using_kubeflow()
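
This one-line change tightens when the spawn-on-CPU plugin is selected: previously any DDP request on a CPU-only setup was treated as CPU spawn, now it is only when a spawn-style distributed type was actually chosen. A hypothetical illustration of the flag values for accelerator="ddp" on a machine without GPUs (the variable names mirror the hunk above; the concrete values are inferred, not part of the diff):

    # Hypothetical values for Trainer(accelerator="ddp", num_processes=2) on a CPU-only machine.
    use_ddp = True          # "ddp" was requested
    use_cpu = True          # no GPUs are available
    use_ddp_spawn = False   # _distrib_type is DDP, not DDP_SPAWN

    old_use_ddp_cpu_spawn = use_ddp and use_cpu        # True  -> spawn plugin picked by mistake
    new_use_ddp_cpu_spawn = use_ddp_spawn and use_cpu  # False -> plain DDPPlugin is picked
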
@@ -738,14 +738,16 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
         if self.distributed_backend is None:
             if self.has_horovodrun():
                 self._set_horovod_backend()
-            elif self.num_gpus == 0 and (self.num_nodes > 1 or self.num_processes > 1):
+            elif self.num_gpus == 0 and self.num_nodes > 1:
                 self._distrib_type = DistributedType.DDP
+            elif self.num_gpus == 0 and self.num_processes > 1:
+                self.distributed_backend = DistributedType.DDP_SPAWN
             elif self.num_gpus > 1 and not _use_cpu:
                 rank_zero_warn(
                     "You requested multiple GPUs but did not specify a backend, e.g."
                     ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`. Setting `accelerator="ddp_spawn"` for you.'
                 )
-                self.distributed_backend = "ddp_spawn"
+                self.distributed_backend = DistributedType.DDP_SPAWN

         # special case with DDP on CPUs
         if self.distributed_backend == DistributedType.DDP_CPU:
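
This hunk splits the CPU-only defaults that apply when no accelerator flag is given: a multi-node run falls back to DDP, a multi-process single-node run falls back to DDP spawn, and the multi-GPU warning path now assigns the DistributedType enum instead of the raw "ddp_spawn" string. A hedged sketch of how the new branches route two CPU-only configurations (the resulting attribute values are inferred from the code above, not asserted by the diff):

    # Assumed routing for CPU-only Trainers with no accelerator flag set.
    trainer = Trainer(num_nodes=2, num_processes=1)
    # num_gpus == 0 and num_nodes > 1 -> _distrib_type becomes DistributedType.DDP

    trainer = Trainer(num_nodes=1, num_processes=4)
    # num_gpus == 0 and num_processes > 1 -> distributed_backend becomes DistributedType.DDP_SPAWN
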
6 changes: 6 additions & 0 deletions tests/accelerators/test_accelerator_connector.py
@@ -623,3 +623,9 @@ def test_unsupported_distrib_types_on_cpu(training_type):
     trainer = Trainer(accelerator=training_type, num_processes=2)

     assert trainer._distrib_type == DistributedType.DDP
+
+
+def test_accelerator_ddp_for_cpu(tmpdir):
+    trainer = Trainer(accelerator="ddp", num_processes=2)
+    assert isinstance(trainer.accelerator, CPUAccelerator)
+    assert isinstance(trainer.training_type_plugin, DDPPlugin)
2 changes: 1 addition & 1 deletion tests/trainer/test_trainer.py
@@ -1750,7 +1750,7 @@ def on_predict_start(self) -> None:


 @pytest.mark.parametrize(
-    "accelerator,num_processes", [(None, 1), pytest.param("ddp", 2, marks=RunIf(skip_windows=True))]
+    "accelerator,num_processes", [(None, 1), pytest.param("ddp_cpu", 2, marks=RunIf(skip_windows=True))]
 )
 def test_model_in_correct_mode_during_stages(tmpdir, accelerator, num_processes):
     model = TrainerStagesModel()
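
The parametrization swap follows from the fix: accelerator="ddp" with num_processes=2 on CPU no longer resolves to the spawn-based CPU plugin, so a test that needs spawn-on-CPU semantics now has to ask for it explicitly. A short sketch of the explicit request, using the same arguments as the parametrization above:

    # "ddp_cpu" explicitly requests spawn-based DDP over CPU processes,
    # which this test previously obtained implicitly via "ddp".
    trainer = Trainer(accelerator="ddp_cpu", num_processes=2)
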
