From 3782c7ae60f8624e566e3879b89bb09e8b59b869 Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Sat, 18 Mar 2023 14:31:36 -0400 Subject: [PATCH] Add jsrun launcher based on neox 2.0 --- configs/neox_arguments.md | 2 +- megatron/neox_arguments/deepspeed_args.py | 2 +- megatron/neox_arguments/neox_args.py | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 416ad5f83..2e20dc61f 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -1809,7 +1809,7 @@ Args for deepspeed runner (deepspeed.launcher.runner). -- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm'] +- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm', 'jsrun'] Default = pdsh diff --git a/megatron/neox_arguments/deepspeed_args.py b/megatron/neox_arguments/deepspeed_args.py index 905f46c78..8ec7e7d63 100644 --- a/megatron/neox_arguments/deepspeed_args.py +++ b/megatron/neox_arguments/deepspeed_args.py @@ -192,7 +192,7 @@ class NeoXArgsDeepspeedRunner(NeoXArgsTemplate): IP address of node 0, will be inferred via 'hostname -I' if not specified. """ - launcher: Literal["pdsh", "openmpi", "mvapich", "slurm"] = "pdsh" + launcher: Literal["pdsh", "openmpi", "mvapich", "slurm", "jsrun"] = "pdsh" """ Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH. """ diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 95e6b6b8e..b6115a50d 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -612,6 +612,11 @@ class NeoXArgsOther(NeoXArgsTemplate): Run via SLURM, this will attempt to discover the necessary variables to initialize torch distributed from the SLURM environment """ + deepspeed_jsrun: bool = False + """ + Run via JSRUN, this will attempt to discover the necessary variables to initialize torch distributed from the IBM LSF environment + """ + user_script: str = None """ user script to be run