Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion scripts/performance/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,41 @@ python scripts/performance/setup_experiment.py
- `-hf/--hf_token`: HuggingFace token for accessing tokenizers and checkpoints.
  - User can generate a token from huggingface.co/settings/tokens (click on "Create new token" button)
- For a "Fine-grained" token, only "User permissions" are needed. Under "User permissions", make selections for "Repositories", "Webhooks" and "Collections".
- `--offline`: Set `HF_HUB_OFFLINE=1` (Slurm launcher path).
- Cannot be used together with `--hf_token`.

##### HuggingFace connectivity and cache behavior (Slurm launcher)

This launcher uses split defaults:

- `TRANSFORMERS_OFFLINE=1`
- `HF_HUB_OFFLINE=0`

What each variable controls in this workflow:

- `TRANSFORMERS_OFFLINE`: Transformers calls (for example `AutoTokenizer`) stay offline unless `--hf_token` is provided.
- `HF_HUB_OFFLINE`: HuggingFace Hub calls (for example Hub-backed config/model resolution such as `AutoConfig`) stay online unless `--offline` is provided.

Why this split exists:

- Most benchmark recipes use `NullTokenizer`, so `TRANSFORMERS_OFFLINE=1` avoids unnecessary network traffic.
- Most performance model families (`llama`, `qwen`, `qwen_vl`, `deepseek`, `gpt_oss`) use HF-backed config/model lookup paths.

Flag mapping:

- `--hf_token` sets `HF_TOKEN` and `TRANSFORMERS_OFFLINE=0`.
- `--offline` sets `HF_HUB_OFFLINE=1`.
- `--hf_token` and `--offline` are mutually exclusive.

Practical guidance:

1. Prefetch required model/tokenizer/config files into a local HF cache.
2. Mount that cache into the container with `-cm/--custom_mounts`.
3. Set `HF_HOME` to that mounted cache path before launch (Slurm exports env vars by default), for example `export HF_HOME=/path/to/hf_cache`.
4. If needed, explicitly override `HF_HOME` with `-ce/--custom_env_vars`.
5. Pass `--offline` to block Hub network checks.

Mounting cached files is not enough by itself. If `HF_HUB_OFFLINE` remains `0`, Hub-backed code paths may still perform network checks and hit HuggingFace rate limits.

##### Parallelism arguments

Expand All @@ -146,12 +181,17 @@ python scripts/performance/setup_experiment.py
- `-ep/--expert_model_parallel_size`: MoE expert parallel degree. Distributes MoE experts across sub data parallel dimension.
- `-et/--expert_tensor_parallel_size`: Expert tensor parallel degree. Intra-layer tensor model parallelism for expert layer. Use `-et` (no value) for `None` or `-et <int>`.

##### Slurm launcher behavior

- The launcher always adds `--container-writable` to `srun`.
- This avoids benchmark failures on clusters using Enroot defaults, where `ENROOT_ROOTFS_WRITABLE=no`.

##### Slurm arguments

- `-a/--account`: Slurm account to use for experiment.
- `-p/--partition`: Slurm partition to use for experiment.
- `-t/--time_limit`: Maximum time limit before the Slurm job is cancelled. Format `HH:MM:SS`. Default `00:30:00`.
- `-gn/--gpus_per_node`: GPUs per node. Default `None`. If not provided, will be inferred from the GPU type.
- `-gn/--gpus_per_node`: GPUs per node. Default `None`. If not provided, it is inferred from the GPU type.
- `-cm/--custom_mounts`: Comma-separated list of host mounts to expose inside the container.
- `-ce/--custom_env_vars`: Comma-separated string of environment variables (format: `key1=value1,key2=value2`).
- `-cs/--custom_srun_args`: Comma-separated string of srun arguments.
Expand Down
8 changes: 7 additions & 1 deletion scripts/performance/argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,12 +318,18 @@ def parse_cli_args():
"--tokenizer_model", type=str, help="Path to tokenizer model (automatically provided by launcher)"
)
tokenizer_args.add_argument("--vocab_size", type=int, default=32000, help="Vocabulary size for NullTokenizer")
tokenizer_args.add_argument(
hf_mode = tokenizer_args.add_mutually_exclusive_group()
hf_mode.add_argument(
"-hf",
"--hf_token",
type=str,
help="HuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.",
)
hf_mode.add_argument(
"--offline",
action="store_true",
help="Enable offline HuggingFace Hub mode by setting HF_HUB_OFFLINE=1.",
)

# Parallelism
parallelism_args = parser.add_argument_group("Parallelism arguments")
Expand Down
9 changes: 8 additions & 1 deletion scripts/performance/setup_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ def main(
compute_dtype: str,
gpu: str,
hf_token: str,
offline: bool,
detach: bool,
dryrun: bool,
enable_vboost: bool,
Expand Down Expand Up @@ -253,7 +254,11 @@ def main(
]
and task == "pretrain"
):
assert hf_token is not None, "HF token is required for Qwen3 tokenizer. NullTokenizer to be used soon."
assert hf_token or offline, (
"Qwen3 tokenizer requires --hf_token (online) or --offline (with a pre-populated local HF cache). "
"For --offline, pre-download the tokenizer with `huggingface-cli download` and ensure HF_HOME points "
"to the cache directory. NullTokenizer to be used soon."
)

if wandb_key is not None:
assert wandb_project_name is not None and wandb_experiment_name is not None, (
Expand Down Expand Up @@ -326,6 +331,7 @@ def main(
custom_bash_cmds=custom_bash_cmds,
gres=gres,
hf_token=hf_token,
offline=offline,
nemo_home=nemo_home,
additional_slurm_params=additional_slurm_params,
wandb_key=wandb_key,
Expand Down Expand Up @@ -592,6 +598,7 @@ def main(
compute_dtype=args.compute_dtype,
gpu=args.gpu,
hf_token=args.hf_token,
offline=args.offline,
detach=args.detach,
dryrun=args.dryrun,
enable_vboost=args.enable_vboost,
Expand Down
26 changes: 17 additions & 9 deletions scripts/performance/utils/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@

PERF_ENV_VARS = {
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
"TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace
"TRANSFORMERS_OFFLINE": "1", # Default for benchmark runs that mostly use NullTokenizer.
"TOKENIZERS_PARALLELISM": "False", # Restrict warning message prints
"NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory
"NVTE_NORM_FWD_USE_CUDNN": "1",
"NVTE_NORM_BWD_USE_CUDNN": "1",
"TORCH_NCCL_HIGH_PRIORITY": "1",
"HF_HUB_OFFLINE": "0",
"HF_HUB_OFFLINE": "0", # Keep HF Hub online by default; --offline flips this to 1.
}


Expand All @@ -61,6 +61,7 @@ def slurm_executor(
custom_env_vars: Dict[str, str] = {},
custom_srun_args: List[str] = [],
hf_token: str = None,
offline: bool = False,
nemo_home: str = DEFAULT_NEMO_HOME,
wandb_key: str = None,
network: str = None,
Expand All @@ -86,6 +87,7 @@ def slurm_executor(
srun_args = custom_srun_args.copy() + [
"--mpi=pmix",
"--no-container-mount-home",
"--container-writable", # Required on clusters using Enroot defaults, where ENROOT_ROOTFS_WRITABLE=no.
]

if log_dir is not None:
Expand All @@ -96,20 +98,26 @@ def slurm_executor(
f"Logs will be written to {get_nemorun_home()}, which is probably not desired. export NEMORUN_HOME in your shell environment or use the --log_dir argument"
)

perf_env = PERF_ENV_VARS.copy()

if wandb_key is not None:
PERF_ENV_VARS["WANDB_API_KEY"] = wandb_key
perf_env["WANDB_API_KEY"] = wandb_key

if gpu.lower() == "gb200":
PERF_ENV_VARS["NCCL_NET_GDR_LEVEL"] = "PHB" # For NCCL 2.25
PERF_ENV_VARS["NCCL_NET_GDR_C2C"] = "1" # For NCCL 2.26
perf_env["NCCL_NET_GDR_LEVEL"] = "PHB" # For NCCL 2.25
perf_env["NCCL_NET_GDR_C2C"] = "1" # For NCCL 2.26

if nemo_home != DEFAULT_NEMO_CACHE_HOME: # DO NOT change this to 'DEFAULT_NEMO_HOME'/'NEMO_HOME'
PERF_ENV_VARS["NEMO_HOME"] = nemo_home
perf_env["NEMO_HOME"] = nemo_home
mounts.extend([f"{nemo_home}:{nemo_home}"])
if hf_token is not None:
PERF_ENV_VARS.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"})
# Enable authenticated online access for tokenizer/config paths.
perf_env.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"})
if offline:
# Disable HF Hub network calls. Requires a pre-populated local HF cache.
perf_env["HF_HUB_OFFLINE"] = "1"

PERF_ENV_VARS.update(custom_env_vars)
perf_env.update(custom_env_vars)
mounts.extend(custom_mounts)

# add --segment flag to sbatch if job uses GB200.
Expand Down Expand Up @@ -143,7 +151,7 @@ def slurm_executor(
gres=gres,
container_image=container_image,
container_mounts=mounts,
env_vars=PERF_ENV_VARS,
env_vars=perf_env,
srun_args=srun_args,
time=time_limit,
mem="0",
Expand Down
203 changes: 203 additions & 0 deletions tests/unit_tests/scripts/test_performance_offline_mode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import sys
import types
from types import SimpleNamespace
from unittest.mock import patch

import pytest


def _package_module(name: str) -> types.ModuleType:
"""Create a minimal package-like module."""
module = types.ModuleType(name)
module.__path__ = []
return module


@pytest.fixture
def import_performance_module():
    """Yield an importer that loads performance modules against a stubbed nemo_run.

    Only the small surface of ``nemo_run`` these tests touch is faked:
    executor/packager constructors just echo their kwargs back as a
    ``SimpleNamespace`` so assertions can inspect what was passed.
    """
    root = _package_module("nemo_run")
    config_mod = types.ModuleType("nemo_run.config")
    core_pkg = _package_module("nemo_run.core")
    execution_pkg = _package_module("nemo_run.core.execution")
    launcher_mod = types.ModuleType("nemo_run.core.execution.launcher")

    # Constructors that record their kwargs instead of doing real work.
    for ctor_name in ("LocalTunnel", "GitArchivePackager", "SlurmExecutor", "DGXCloudExecutor"):
        setattr(root, ctor_name, lambda **kwargs: SimpleNamespace(**kwargs))
    launcher_mod.SlurmTemplate = lambda **kwargs: SimpleNamespace(**kwargs)

    config_mod.get_nemorun_home = lambda: "/tmp/nemorun"
    config_mod.set_nemorun_home = lambda _path: None

    # Wire the dotted hierarchy together so attribute access mirrors imports.
    root.config = config_mod
    root.core = core_pkg
    core_pkg.execution = execution_pkg
    execution_pkg.launcher = launcher_mod

    stubs = {
        "nemo_run": root,
        "nemo_run.config": config_mod,
        "nemo_run.core": core_pkg,
        "nemo_run.core.execution": execution_pkg,
        "nemo_run.core.execution.launcher": launcher_mod,
    }

    with patch.dict(sys.modules, stubs):

        def _fresh_import(module_name: str):
            # Drop any cached copy so the module re-binds against the stubs.
            sys.modules.pop(module_name, None)
            return importlib.import_module(module_name)

        yield _fresh_import


def test_parse_cli_args_accepts_offline_flag(import_performance_module):
    """The performance CLI should keep exposing the offline switch."""
    cli_module = import_performance_module("scripts.performance.argument_parser")
    cli = cli_module.parse_cli_args()

    argv = [
        "--model_family_name", "llama",
        "--model_recipe_name", "llama3_8b",
        "--num_gpus", "8",
        "--gpu", "h100",
        "--offline",
    ]
    parsed = cli.parse_args(argv)

    assert parsed.offline is True


def test_argparse_rejects_hf_token_with_offline(import_performance_module):
    """argparse should reject --hf_token and --offline together at parse time."""
    cli_module = import_performance_module("scripts.performance.argument_parser")
    cli = cli_module.parse_cli_args()

    conflicting_argv = [
        "--model_family_name", "llama",
        "--model_recipe_name", "llama3_8b",
        "--num_gpus", "8",
        "--gpu", "h100",
        "--hf_token", "hf_test_token",
        "--offline",
    ]

    # The mutually exclusive group makes argparse error out, which raises SystemExit.
    with pytest.raises(SystemExit):
        cli.parse_args(conflicting_argv)


def test_slurm_executor_sets_offline_env_and_container_writable(import_performance_module):
    """Offline mode should set HF_HUB_OFFLINE and preserve the offline Transformers default."""
    executors_module = import_performance_module("scripts.performance.utils.executors")

    built = executors_module.slurm_executor(
        gpu="h100",
        account="test_account",
        partition="test_partition",
        log_dir="/tmp/log_dir",
        nodes=1,
        num_gpus_per_node=8,
        offline=True,
    )

    env = built.env_vars
    assert "--container-writable" in built.srun_args
    assert env["HF_HUB_OFFLINE"] == "1"
    assert env["TRANSFORMERS_OFFLINE"] == "1"


def test_slurm_executor_default_has_container_writable_and_hub_online(import_performance_module):
    """By default, --container-writable is always set and HF Hub access stays online."""
    executors_module = import_performance_module("scripts.performance.utils.executors")

    built = executors_module.slurm_executor(
        gpu="h100",
        account="test_account",
        partition="test_partition",
        log_dir="/tmp/log_dir",
        nodes=1,
        num_gpus_per_node=8,
    )

    env = built.env_vars
    assert "--container-writable" in built.srun_args
    assert env["HF_HUB_OFFLINE"] == "0"
    assert env["TRANSFORMERS_OFFLINE"] == "1"


def test_slurm_executor_hf_token_enables_online_transformers(import_performance_module):
    """Providing an HF token should enable the online Transformers path."""
    executors_module = import_performance_module("scripts.performance.utils.executors")

    built = executors_module.slurm_executor(
        gpu="h100",
        account="test_account",
        partition="test_partition",
        log_dir="/tmp/log_dir",
        nodes=1,
        num_gpus_per_node=8,
        hf_token="hf_test_token",
    )

    env = built.env_vars
    assert env["HF_TOKEN"] == "hf_test_token"
    assert env["TRANSFORMERS_OFFLINE"] == "0"
    assert env["HF_HUB_OFFLINE"] == "0"


def test_slurm_executor_no_state_leakage_between_calls(import_performance_module):
    """Calling slurm_executor twice must not leak env vars from the first call into the second."""
    executors_module = import_performance_module("scripts.performance.utils.executors")

    common_kwargs = dict(
        gpu="h100",
        account="test_account",
        partition="test_partition",
        log_dir="/tmp/log_dir",
        nodes=1,
        num_gpus_per_node=8,
    )

    # First call injects secrets via hf_token and wandb_key.
    with_secrets = executors_module.slurm_executor(
        hf_token="hf_secret",
        wandb_key="wandb_secret",
        **common_kwargs,
    )
    assert with_secrets.env_vars["HF_TOKEN"] == "hf_secret"
    assert with_secrets.env_vars["WANDB_API_KEY"] == "wandb_secret"

    # Second call passes neither secret and must see pristine defaults.
    plain = executors_module.slurm_executor(**common_kwargs)
    assert "HF_TOKEN" not in plain.env_vars
    assert "WANDB_API_KEY" not in plain.env_vars
    assert plain.env_vars["TRANSFORMERS_OFFLINE"] == "1"
Loading