diff --git a/deepspeed/pt/deepspeed_run.py b/deepspeed/pt/deepspeed_run.py
index 02a5f351cd6d..cd82097c8ffe 100755
--- a/deepspeed/pt/deepspeed_run.py
+++ b/deepspeed/pt/deepspeed_run.py
@@ -21,6 +21,7 @@
 EXPORT_ENVS = ["NCCL", "PYTHON"]
 DEEPSPEED_ENVIRONMENT_NAME = ".deepspeed_env"
 DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
+PDSH_MAX_FAN_OUT = 1024
 
 
 def parse_args(args=None):
@@ -294,7 +295,9 @@ def main(args=None):
         active_workers = ",".join(active_resources.keys())
         logger.info("Running on the following workers: %s" % active_workers)
 
-        pdsh_cmd_args = ['pdsh', '-w', active_workers]
+        # PDSH flags for max node fan out and specific hosts to launch on
+        # See https://linux.die.net/man/1/pdsh for flag details
+        pdsh_cmd_args = ['pdsh', '-f', str(PDSH_MAX_FAN_OUT), '-w', active_workers]
 
         num_nodes = len(active_resources.keys())
         num_gpus_per_node = None