diff --git a/scripts/performance/README.md b/scripts/performance/README.md
index 5ebadd44fe..9acd517d27 100644
--- a/scripts/performance/README.md
+++ b/scripts/performance/README.md
@@ -136,6 +136,41 @@ python scripts/performance/setup_experiment.py
 - `-hf/--hf_token`: HuggingFace token for accessing tokenizers and checkpoints.
   - User can generate a token from- huggingface.co/settings/tokens (click on "Create new token" button)
   - For a "Fine-grained" token, only "User permissions" are needed. Under "User permissions", make selections for "Repositories", "Webhooks" and "Collections".
+- `--offline`: Set `HF_HUB_OFFLINE=1` (Slurm launcher path).
+  - Cannot be used together with `--hf_token`.
+
+##### HuggingFace connectivity and cache behavior (Slurm launcher)
+
+This launcher uses split defaults:
+
+- `TRANSFORMERS_OFFLINE=1`
+- `HF_HUB_OFFLINE=0`
+
+What each variable controls in this workflow:
+
+- `TRANSFORMERS_OFFLINE`: Transformers calls (for example `AutoTokenizer`) stay offline unless `--hf_token` is provided.
+- `HF_HUB_OFFLINE`: HuggingFace Hub calls (for example Hub-backed config/model resolution such as `AutoConfig`) stay online unless `--offline` is provided.
+
+Why this split exists:
+
+- Most benchmark recipes use `NullTokenizer`, so `TRANSFORMERS_OFFLINE=1` avoids unnecessary network traffic.
+- Most performance model families (`llama`, `qwen`, `qwen_vl`, `deepseek`, `gpt_oss`) use HF-backed config/model lookup paths.
+
+Flag mapping:
+
+- `--hf_token` sets `HF_TOKEN` and `TRANSFORMERS_OFFLINE=0`.
+- `--offline` sets `HF_HUB_OFFLINE=1`.
+- `--hf_token` and `--offline` are mutually exclusive.
+
+Practical guidance:
+
+1. Prefetch required model/tokenizer/config files into a local HF cache.
+2. Mount that cache into the container with `-cm/--custom_mounts`.
+3. Set `HF_HOME` to that mounted cache path before launch (Slurm exports env vars by default), for example `export HF_HOME=/path/to/hf_cache`.
+4. If needed, explicitly override `HF_HOME` with `-ce/--custom_env_vars`.
+5. Pass `--offline` to block Hub network checks.
+
+Mounting cached files is not enough by itself. If `HF_HUB_OFFLINE` remains `0`, Hub-backed code paths may still perform network checks and hit HuggingFace rate limits.
 
 ##### Parallelism arguments
 
@@ -146,12 +181,17 @@ python scripts/performance/setup_experiment.py
 - `-ep/--expert_model_parallel_size`: MoE expert parallel degree. Distributes MoE experts across sub data parallel dimension.
 - `-et/--expert_tensor_parallel_size`: Expert tensor parallel degree. Intra-layer tensor model parallelism for expert layer. Use `-et` (no value) for `None` or `-et `.
 
+##### Slurm launcher behavior
+
+- The launcher always adds `--container-writable` to `srun`.
+- This avoids benchmark failures on clusters using Enroot defaults, where `ENROOT_ROOTFS_WRITABLE=no`.
+
 ##### Slurm arguments
 
 - `-a/--account`: Slurm account to use for experiment.
 - `-p/--partition`: Slurm partition to use for experiment.
 - `-t/--time_limit`: Maximum time limit before the Slurm job is cancelled. Format `HH:MM:SS`. Default `00:30:00`.
-- `-gn/--gpus_per_node`: GPUs per node. Default `None`. If not provided, will be inferred from the GPU type.
+- `-gn/--gpus_per_node`: GPUs per node. Default `None`. If not provided, it is inferred from the GPU type.
 - `-cm/--custom_mounts`: Comma-separated list of host mounts to expose inside the container.
 - `-ce/--custom_env_vars`: Comma-separated string of environment variables (format: `key1=value1,key2=value2`).
 - `-cs/--custom_srun_args`: Comma-separated string of srun arguments.
diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py
index a4626ac80c..e0f04c25ff 100644
--- a/scripts/performance/argument_parser.py
+++ b/scripts/performance/argument_parser.py
@@ -318,12 +318,18 @@ def parse_cli_args():
         "--tokenizer_model", type=str, help="Path to tokenizer model (automatically provided by launcher)"
     )
     tokenizer_args.add_argument("--vocab_size", type=int, default=32000, help="Vocabulary size for NullTokenizer")
-    tokenizer_args.add_argument(
+    hf_mode = tokenizer_args.add_mutually_exclusive_group()
+    hf_mode.add_argument(
         "-hf",
         "--hf_token",
         type=str,
         help="HuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.",
     )
+    hf_mode.add_argument(
+        "--offline",
+        action="store_true",
+        help="Enable offline HuggingFace Hub mode by setting HF_HUB_OFFLINE=1.",
+    )
 
     # Parallelism
     parallelism_args = parser.add_argument_group("Parallelism arguments")
diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py
index 7a35f70afe..491a74d0e7 100755
--- a/scripts/performance/setup_experiment.py
+++ b/scripts/performance/setup_experiment.py
@@ -186,6 +186,7 @@ def main(
     compute_dtype: str,
     gpu: str,
     hf_token: str,
+    offline: bool,
     detach: bool,
     dryrun: bool,
     enable_vboost: bool,
@@ -253,7 +254,11 @@ def main(
         ]
         and task == "pretrain"
     ):
-        assert hf_token is not None, "HF token is required for Qwen3 tokenizer. NullTokenizer to be used soon."
+        assert hf_token or offline, (
+            "Qwen3 tokenizer requires --hf_token (online) or --offline (with a pre-populated local HF cache). "
+            "For --offline, pre-download the tokenizer with `huggingface-cli download` and ensure HF_HOME points "
+            "to the cache directory. NullTokenizer to be used soon."
+        )
 
     if wandb_key is not None:
         assert wandb_project_name is not None and wandb_experiment_name is not None, (
@@ -326,6 +331,7 @@ def main(
             custom_bash_cmds=custom_bash_cmds,
             gres=gres,
             hf_token=hf_token,
+            offline=offline,
             nemo_home=nemo_home,
             additional_slurm_params=additional_slurm_params,
             wandb_key=wandb_key,
@@ -592,6 +598,7 @@ def main(
         compute_dtype=args.compute_dtype,
         gpu=args.gpu,
         hf_token=args.hf_token,
+        offline=args.offline,
         detach=args.detach,
         dryrun=args.dryrun,
         enable_vboost=args.enable_vboost,
diff --git a/scripts/performance/utils/executors.py b/scripts/performance/utils/executors.py
index a1ba8fac2b..45994fe629 100644
--- a/scripts/performance/utils/executors.py
+++ b/scripts/performance/utils/executors.py
@@ -38,13 +38,13 @@ PERF_ENV_VARS = {
     "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
-    "TRANSFORMERS_OFFLINE": "1",  # Disable online downloads from HuggingFace
+    "TRANSFORMERS_OFFLINE": "1",  # Default for benchmark runs that mostly use NullTokenizer.
     "TOKENIZERS_PARALLELISM": "False",  # Restrict warning message prints
     "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
     "NVTE_NORM_FWD_USE_CUDNN": "1",
     "NVTE_NORM_BWD_USE_CUDNN": "1",
     "TORCH_NCCL_HIGH_PRIORITY": "1",
-    "HF_HUB_OFFLINE": "0",
+    "HF_HUB_OFFLINE": "0",  # Keep HF Hub online by default; --offline flips this to 1.
 }
@@ -61,6 +61,7 @@ def slurm_executor(
     custom_env_vars: Dict[str, str] = {},
     custom_srun_args: List[str] = [],
     hf_token: str = None,
+    offline: bool = False,
     nemo_home: str = DEFAULT_NEMO_HOME,
     wandb_key: str = None,
     network: str = None,
@@ -86,6 +87,7 @@
     srun_args = custom_srun_args.copy() + [
         "--mpi=pmix",
         "--no-container-mount-home",
+        "--container-writable",  # Required on clusters using Enroot defaults, where ENROOT_ROOTFS_WRITABLE=no.
     ]
 
     if log_dir is not None:
@@ -96,20 +98,26 @@
            f"Logs will be written to {get_nemorun_home()}, which is probably not desired. export NEMORUN_HOME in your shell environment or use the --log_dir argument"
        )
+    perf_env = PERF_ENV_VARS.copy()
+
     if wandb_key is not None:
-        PERF_ENV_VARS["WANDB_API_KEY"] = wandb_key
+        perf_env["WANDB_API_KEY"] = wandb_key
 
     if gpu.lower() == "gb200":
-        PERF_ENV_VARS["NCCL_NET_GDR_LEVEL"] = "PHB"  # For NCCL 2.25
-        PERF_ENV_VARS["NCCL_NET_GDR_C2C"] = "1"  # For NCCL 2.26
+        perf_env["NCCL_NET_GDR_LEVEL"] = "PHB"  # For NCCL 2.25
+        perf_env["NCCL_NET_GDR_C2C"] = "1"  # For NCCL 2.26
 
     if nemo_home != DEFAULT_NEMO_CACHE_HOME:  # DO NOT change this to 'DEFAULT_NEMO_HOME'/'NEMO_HOME'
-        PERF_ENV_VARS["NEMO_HOME"] = nemo_home
+        perf_env["NEMO_HOME"] = nemo_home
         mounts.extend([f"{nemo_home}:{nemo_home}"])
 
     if hf_token is not None:
-        PERF_ENV_VARS.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"})
+        # Enable authenticated online access for tokenizer/config paths.
+        perf_env.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"})
+    if offline:
+        # Disable HF Hub network calls. Requires a pre-populated local HF cache.
+        perf_env["HF_HUB_OFFLINE"] = "1"
 
-    PERF_ENV_VARS.update(custom_env_vars)
+    perf_env.update(custom_env_vars)
     mounts.extend(custom_mounts)
 
     # add --segment flag to sbatch if job uses GB200.
@@ -143,7 +151,7 @@
         gres=gres,
         container_image=container_image,
         container_mounts=mounts,
-        env_vars=PERF_ENV_VARS,
+        env_vars=perf_env,
         srun_args=srun_args,
         time=time_limit,
         mem="0",
diff --git a/tests/unit_tests/scripts/test_performance_offline_mode.py b/tests/unit_tests/scripts/test_performance_offline_mode.py
new file mode 100644
index 0000000000..79a781ca60
--- /dev/null
+++ b/tests/unit_tests/scripts/test_performance_offline_mode.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import sys
+import types
+from types import SimpleNamespace
+from unittest.mock import patch
+
+import pytest
+
+
+def _package_module(name: str) -> types.ModuleType:
+    """Create a minimal package-like module."""
+    module = types.ModuleType(name)
+    module.__path__ = []
+    return module
+
+
+@pytest.fixture
+def import_performance_module():
+    """Import performance modules with only the nemo_run surface these tests need."""
+    nemo_run = _package_module("nemo_run")
+    nemo_run_config = types.ModuleType("nemo_run.config")
+    nemo_run_core = _package_module("nemo_run.core")
+    nemo_run_core_execution = _package_module("nemo_run.core.execution")
+    nemo_run_core_execution_launcher = types.ModuleType("nemo_run.core.execution.launcher")
+
+    nemo_run.LocalTunnel = lambda **kwargs: SimpleNamespace(**kwargs)
+    nemo_run.GitArchivePackager = lambda **kwargs: SimpleNamespace(**kwargs)
+    nemo_run.SlurmExecutor = lambda **kwargs: SimpleNamespace(**kwargs)
+    nemo_run.DGXCloudExecutor = lambda **kwargs: SimpleNamespace(**kwargs)
+
+    nemo_run_config.get_nemorun_home = lambda: "/tmp/nemorun"
+    nemo_run_config.set_nemorun_home = lambda _path: None
+
+    nemo_run_core_execution_launcher.SlurmTemplate = lambda **kwargs: SimpleNamespace(**kwargs)
+
+    nemo_run.config = nemo_run_config
+    nemo_run.core = nemo_run_core
+    nemo_run_core.execution = nemo_run_core_execution
+    nemo_run_core_execution.launcher = nemo_run_core_execution_launcher
+
+    with patch.dict(
+        sys.modules,
+        {
+            "nemo_run": nemo_run,
+            "nemo_run.config": nemo_run_config,
+            "nemo_run.core": nemo_run_core,
+            "nemo_run.core.execution": nemo_run_core_execution,
+            "nemo_run.core.execution.launcher": nemo_run_core_execution_launcher,
+        },
+    ):
+
+        def _import(module_name: str):
+            sys.modules.pop(module_name, None)
+            return importlib.import_module(module_name)
+
+        yield _import
+
+
+def test_parse_cli_args_accepts_offline_flag(import_performance_module):
+    """The performance CLI should keep exposing the offline switch."""
+    argument_parser = import_performance_module("scripts.performance.argument_parser")
+
+    parser = argument_parser.parse_cli_args()
+    args = parser.parse_args(
+        [
+            "--model_family_name",
+            "llama",
+            "--model_recipe_name",
+            "llama3_8b",
+            "--num_gpus",
+            "8",
+            "--gpu",
+            "h100",
+            "--offline",
+        ]
+    )
+
+    assert args.offline is True
+
+
+def test_argparse_rejects_hf_token_with_offline(import_performance_module):
+    """argparse should reject --hf_token and --offline together at parse time."""
+    argument_parser = import_performance_module("scripts.performance.argument_parser")
+
+    parser = argument_parser.parse_cli_args()
+    with pytest.raises(SystemExit):
+        parser.parse_args(
+            [
+                "--model_family_name",
+                "llama",
+                "--model_recipe_name",
+                "llama3_8b",
+                "--num_gpus",
+                "8",
+                "--gpu",
+                "h100",
+                "--hf_token",
+                "hf_test_token",
+                "--offline",
+            ]
+        )
+
+
+def test_slurm_executor_sets_offline_env_and_container_writable(import_performance_module):
+    """Offline mode should set HF_HUB_OFFLINE and preserve the offline Transformers default."""
+    executors = import_performance_module("scripts.performance.utils.executors")
+
+    executor = executors.slurm_executor(
+        gpu="h100",
+        account="test_account",
+        partition="test_partition",
+        log_dir="/tmp/log_dir",
+        nodes=1,
+        num_gpus_per_node=8,
+        offline=True,
+    )
+
+    assert "--container-writable" in executor.srun_args
+    assert executor.env_vars["HF_HUB_OFFLINE"] == "1"
+    assert executor.env_vars["TRANSFORMERS_OFFLINE"] == "1"
+
+
+def test_slurm_executor_default_has_container_writable_and_hub_online(import_performance_module):
+    """By default, --container-writable is always set and HF Hub access stays online."""
+    executors = import_performance_module("scripts.performance.utils.executors")
+
+    executor = executors.slurm_executor(
+        gpu="h100",
+        account="test_account",
+        partition="test_partition",
+        log_dir="/tmp/log_dir",
+        nodes=1,
+        num_gpus_per_node=8,
+    )
+
+    assert "--container-writable" in executor.srun_args
+    assert executor.env_vars["HF_HUB_OFFLINE"] == "0"
+    assert executor.env_vars["TRANSFORMERS_OFFLINE"] == "1"
+
+
+def test_slurm_executor_hf_token_enables_online_transformers(import_performance_module):
+    """Providing an HF token should enable the online Transformers path."""
+    executors = import_performance_module("scripts.performance.utils.executors")
+
+    executor = executors.slurm_executor(
+        gpu="h100",
+        account="test_account",
+        partition="test_partition",
+        log_dir="/tmp/log_dir",
+        nodes=1,
+        num_gpus_per_node=8,
+        hf_token="hf_test_token",
+    )
+
+    assert executor.env_vars["HF_TOKEN"] == "hf_test_token"
+    assert executor.env_vars["TRANSFORMERS_OFFLINE"] == "0"
+    assert executor.env_vars["HF_HUB_OFFLINE"] == "0"
+
+
+def test_slurm_executor_no_state_leakage_between_calls(import_performance_module):
+    """Calling slurm_executor twice must not leak env vars from the first call into the second."""
+    executors = import_performance_module("scripts.performance.utils.executors")
+
+    # First call: with hf_token and wandb_key
+    first = executors.slurm_executor(
+        gpu="h100",
+        account="test_account",
+        partition="test_partition",
+        log_dir="/tmp/log_dir",
+        nodes=1,
+        num_gpus_per_node=8,
+        hf_token="hf_secret",
+        wandb_key="wandb_secret",
+    )
+    assert first.env_vars["HF_TOKEN"] == "hf_secret"
+    assert first.env_vars["WANDB_API_KEY"] == "wandb_secret"
+
+    # Second call: no token, no wandb — should get clean defaults
+    second = executors.slurm_executor(
+        gpu="h100",
+        account="test_account",
+        partition="test_partition",
+        log_dir="/tmp/log_dir",
+        nodes=1,
+        num_gpus_per_node=8,
+    )
+    assert "HF_TOKEN" not in second.env_vars
+    assert "WANDB_API_KEY" not in second.env_vars
+    assert second.env_vars["TRANSFORMERS_OFFLINE"] == "1"