Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
williamFalcon authored and tullie committed Apr 3, 2020
1 parent 24109e8 commit 470fbd0
Showing 1 changed file with 13 additions and 0 deletions.
13 changes: 13 additions & 0 deletions pytorch_lightning/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,7 @@ def fit(
task = int(os.environ['SLURM_LOCALID'])
self.ddp_train(task, model)
else:
self.__set_random_port()
mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model,))

# 1 gpu or dp option triggers training using DP module
Expand Down Expand Up @@ -994,6 +995,18 @@ def fit(
# used for testing or when we need to know that training succeeded
return 1

def __set_random_port(self):
    """Ensure ``MASTER_PORT`` is set before spawning DDP processes.

    When DDP is launched via ``mp.spawn`` (i.e. NOT managed by SLURM),
    concurrent jobs on the same host would otherwise race for the same
    default rendezvous port. Picking a random port avoids those
    collisions. An already-exported ``MASTER_PORT`` is left unchanged.
    """
    import random

    # Only a *missing* variable should trigger the fallback; catching
    # the narrow KeyError avoids masking unrelated failures.
    try:
        default_port = os.environ['MASTER_PORT']
    except KeyError:
        # randint is inclusive on both ends -> ports 10000..19000.
        default_port = random.randint(10000, 19000)
    # str() normalizes the int fallback; re-setting an existing string
    # value is a no-op.
    os.environ['MASTER_PORT'] = str(default_port)

def __set_fit_dataloaders(self, model, train_dataloader, val_dataloaders, test_dataloaders):
# when dataloader is passed via fit, patch the train_dataloader
# functions to overwrite with these implementations
Expand Down

0 comments on commit 470fbd0

Please sign in to comment.