Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/performance/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ python scripts/performance/setup_experiment.py
- `-a/--account`: Slurm account to use for experiment.
- `-p/--partition`: Slurm partition to use for experiment.
- `-t/--time_limit`: Maximum time limit before the Slurm job is cancelled. Format `HH:MM:SS`. Default `00:30:00`.
- `-gn/--gpus_per_node`: GPUs per node. Default `8`.
- `-gn/--gpus_per_node`: GPUs per node. Default `None`. If not provided, will be inferred from the GPU type.
- `-cm/--custom_mounts`: Comma-separated list of host mounts to expose inside the container.
- `-ce/--custom_env_vars`: Comma-separated string of environment variables (format: `key1=value1,key2=value2`).
- `-cs/--custom_srun_args`: Comma-separated string of srun arguments.
Expand Down
14 changes: 11 additions & 3 deletions scripts/performance/argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@
VALID_CUDA_GRAPH_IMPLS = ["none", "local", "transformer_engine"]
VALID_CUDA_GRAPH_SCOPES = ["full_iteration", "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba"]

NUM_GPUS_PER_NODE_MAP = {
"h100": 8,
"b200": 8,
"b300": 8,
"gb200": 4,
"gb300": 4,
}


def list_of_strings(arg):
"""Split a comma-separated string into a list of substrings."""
Expand Down Expand Up @@ -383,8 +391,8 @@ def parse_cli_args():
"-gn",
"--gpus_per_node",
type=int,
help="Number of gpus per node. Defaults to 8",
default=8,
help="Number of gpus per node. Defaults to None. If not provided, will be inferred from the GPU type.",
default=None,
Comment on lines 391 to +395
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Validate --gpus_per_node as a positive integer.

Line 385 currently accepts any integer. 0 can trigger division by zero in downstream node calculation, and negative values can produce invalid node counts.

🛠️ Proposed fix
+def positive_int(arg: str) -> int:
+    value = int(arg)
+    if value <= 0:
+        raise argparse.ArgumentTypeError("Value must be a positive integer")
+    return value
+
 ...
     slurm_args.add_argument(
         "-gn",
         "--gpus_per_node",
-        type=int,
+        type=positive_int,
         help="Number of gpus per node. Defaults to None. If not provided, will be inferred from the GPU type.",
         default=None,
     )
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/performance/argument_parser.py` around lines 383 - 387, The
--gpus_per_node argparse argument currently allows non-positive integers; add
validation so gpus_per_node must be either None or a positive int (>0).
Implement a small validator (e.g., a custom type function like positive_int or a
check in the argument parsing flow) and use it for the "--gpus_per_node" ("-gn",
"--gpus_per_node") argument to raise argparse.ArgumentTypeError for 0 or
negative values, keeping default=None and leaving None handling for downstream
inference unchanged.

)
Comment on lines 391 to 396
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Validate --gpus_per_node as a positive integer at parse time.

Line 385 currently accepts 0/negative values, which can break downstream node calculation (num_gpus // gpus_per_node) and produce runtime failures.

🛠️ Proposed fix
 def list_of_ints(arg):
@@
     return result
+
+
+def positive_int(arg: str) -> int:
+    """Parse a strictly positive integer CLI value."""
+    value = int(arg)
+    if value < 1:
+        raise argparse.ArgumentTypeError("value must be >= 1")
+    return value
@@
     slurm_args.add_argument(
         "-gn",
         "--gpus_per_node",
-        type=int,
+        type=positive_int,
         help="Number of gpus per node. Defaults to None. If not provided, will be inferred from the GPU type.",
         default=None,
     )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
"-gn",
"--gpus_per_node",
type=int,
help="Number of gpus per node. Defaults to 8",
default=8,
help="Number of gpus per node. Defaults to None. If not provided, will be inferred from the GPU type.",
default=None,
)
"-gn",
"--gpus_per_node",
type=positive_int,
help="Number of gpus per node. Defaults to None. If not provided, will be inferred from the GPU type.",
default=None,
)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/performance/argument_parser.py` around lines 383 - 388, The
--gpus_per_node add_argument currently allows 0/negative values; add a small
validator and use it as the argument type so parsing fails early. Create a
helper like positive_int(value) that converts to int and raises
argparse.ArgumentTypeError if value <= 0, then replace the existing
add_argument(..., "--gpus_per_node", type=int, ...) to use type=positive_int in
the argument_parser (the add_argument call for "--gpus_per_node" and any
parse_args or parse function that references it). This ensures invalid values
are rejected at parse time with a clear error.

slurm_args.add_argument(
"-i",
Expand Down Expand Up @@ -500,7 +508,7 @@ def parse_cli_args():
"-g",
"--gpu",
type=str,
choices=["h100", "b200", "gb200", "gb300", "b300"],
choices=NUM_GPUS_PER_NODE_MAP.keys(),
help="Target gpu type.",
required=True,
)
Expand Down
15 changes: 12 additions & 3 deletions scripts/performance/setup_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@


try:
from argument_parser import parse_cli_args
from argument_parser import NUM_GPUS_PER_NODE_MAP, parse_cli_args
from utils.evaluate import calc_convergence_and_performance
from utils.executors import dgxc_executor, slurm_executor
from utils.utils import get_exp_name_config, select_config_variant_interactive
except (ImportError, ModuleNotFoundError):
from .argument_parser import parse_cli_args
from .argument_parser import NUM_GPUS_PER_NODE_MAP, parse_cli_args
from .utils.evaluate import calc_convergence_and_performance
from .utils.executors import dgxc_executor, slurm_executor
from .utils.utils import get_exp_name_config, select_config_variant_interactive
Expand Down Expand Up @@ -529,6 +529,15 @@ def main(
parser = parse_cli_args()
args, unknown_args = parser.parse_known_args()

gpus_per_node = args.gpus_per_node
if gpus_per_node is None:
if args.gpu in NUM_GPUS_PER_NODE_MAP:
gpus_per_node = NUM_GPUS_PER_NODE_MAP[args.gpu]
else:
raise ValueError(
f"Invalid GPU type: {args.gpu}. Please use one of the following: {NUM_GPUS_PER_NODE_MAP.keys()}"
)

assert not (args.enable_nsys and args.pytorch_profiler), (
"Both NSys and PyTorch profiler cannot be enabled at the same time"
)
Expand Down Expand Up @@ -586,7 +595,7 @@ def main(
account=args.account,
partition=args.partition,
log_dir=args.log_dir,
gpus_per_node=args.gpus_per_node,
gpus_per_node=gpus_per_node,
time_limit=args.time_limit,
container_image=args.container_image,
custom_mounts=args.custom_mounts,
Expand Down
Loading