diff --git a/scripts/performance/README.md b/scripts/performance/README.md
index d643fdd50a..5ebadd44fe 100644
--- a/scripts/performance/README.md
+++ b/scripts/performance/README.md
@@ -151,7 +151,7 @@ python scripts/performance/setup_experiment.py
 - `-a/--account`: Slurm account to use for experiment.
 - `-p/--partition`: Slurm partition to use for experiment.
 - `-t/--time_limit`: Maximum time limit before the Slurm job is cancelled. Format `HH:MM:SS`. Default `00:30:00`.
-- `-gn/--gpus_per_node`: GPUs per node. Default `8`.
+- `-gn/--gpus_per_node`: GPUs per node. Default `None`. If not provided, will be inferred from the GPU type.
 - `-cm/--custom_mounts`: Comma-separated list of host mounts to expose inside the container.
 - `-ce/--custom_env_vars`: Comma-separated string of environment variables (format: `key1=value1,key2=value2`).
 - `-cs/--custom_srun_args`: Comma-separated string of srun arguments.
diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py
index bb4e82c7cd..67fcc66e42 100644
--- a/scripts/performance/argument_parser.py
+++ b/scripts/performance/argument_parser.py
@@ -26,6 +26,14 @@
 VALID_CUDA_GRAPH_IMPLS = ["none", "local", "transformer_engine"]
 VALID_CUDA_GRAPH_SCOPES = ["full_iteration", "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba"]
 
+NUM_GPUS_PER_NODE_MAP = {
+    "h100": 8,
+    "b200": 8,
+    "b300": 8,
+    "gb200": 4,
+    "gb300": 4,
+}
+
 
 def list_of_strings(arg):
     """Split a comma-separated string into a list of substrings."""
@@ -383,8 +391,8 @@ def parse_cli_args():
         "-gn",
         "--gpus_per_node",
         type=int,
-        help="Number of gpus per node. Defaults to 8",
-        default=8,
+        help="Number of gpus per node. Defaults to None. If not provided, will be inferred from the GPU type.",
+        default=None,
     )
     slurm_args.add_argument(
         "-i",
@@ -500,7 +508,7 @@ def parse_cli_args():
         "-g",
         "--gpu",
         type=str,
-        choices=["h100", "b200", "gb200", "gb300", "b300"],
+        choices=NUM_GPUS_PER_NODE_MAP.keys(),
         help="Target gpu type.",
         required=True,
     )
diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py
index 1e5714dffe..2ee374e7de 100755
--- a/scripts/performance/setup_experiment.py
+++ b/scripts/performance/setup_experiment.py
@@ -28,12 +28,12 @@
 try:
-    from argument_parser import parse_cli_args
+    from argument_parser import NUM_GPUS_PER_NODE_MAP, parse_cli_args
     from utils.evaluate import calc_convergence_and_performance
     from utils.executors import dgxc_executor, slurm_executor
     from utils.utils import get_exp_name_config, select_config_variant_interactive
 except (ImportError, ModuleNotFoundError):
-    from .argument_parser import parse_cli_args
+    from .argument_parser import NUM_GPUS_PER_NODE_MAP, parse_cli_args
     from .utils.evaluate import calc_convergence_and_performance
     from .utils.executors import dgxc_executor, slurm_executor
     from .utils.utils import get_exp_name_config, select_config_variant_interactive
@@ -529,6 +529,15 @@ def main(
     parser = parse_cli_args()
     args, unknown_args = parser.parse_known_args()
 
+    gpus_per_node = args.gpus_per_node
+    if gpus_per_node is None:
+        if args.gpu in NUM_GPUS_PER_NODE_MAP:
+            gpus_per_node = NUM_GPUS_PER_NODE_MAP[args.gpu]
+        else:
+            raise ValueError(
+                f"Invalid GPU type: {args.gpu}. Please use one of the following: {NUM_GPUS_PER_NODE_MAP.keys()}"
+            )
+
     assert not (args.enable_nsys and args.pytorch_profiler), (
         "Both NSys and PyTorch profiler cannot be enabled at the same time"
    )
@@ -586,7 +595,7 @@ def main(
         account=args.account,
         partition=args.partition,
         log_dir=args.log_dir,
-        gpus_per_node=args.gpus_per_node,
+        gpus_per_node=gpus_per_node,
         time_limit=args.time_limit,
         container_image=args.container_image,
         custom_mounts=args.custom_mounts,