Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion scripts/performance/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,41 @@ python scripts/performance/setup_experiment.py
- `-hf/--hf_token`: HuggingFace token for accessing tokenizers and checkpoints.
  - User can generate a token from huggingface.co/settings/tokens (click on "Create new token" button)
- For a "Fine-grained" token, only "User permissions" are needed. Under "User permissions", make selections for "Repositories", "Webhooks" and "Collections".
- `--offline`: Set `HF_HUB_OFFLINE=1` (Slurm launcher path).
- Cannot be used together with `--hf_token`.

##### HuggingFace connectivity and cache behavior (Slurm launcher)

This launcher uses split defaults:

- `TRANSFORMERS_OFFLINE=1`
- `HF_HUB_OFFLINE=0`

What each variable controls in this workflow:

- `TRANSFORMERS_OFFLINE`: Transformers calls (for example `AutoTokenizer`) stay offline unless `--hf_token` is provided.
- `HF_HUB_OFFLINE`: HuggingFace Hub calls (for example Hub-backed config/model resolution such as `AutoConfig`) stay online unless `--offline` is provided.

Why this split exists:

- Most benchmark recipes use `NullTokenizer`, so `TRANSFORMERS_OFFLINE=1` avoids unnecessary network traffic.
- Most performance model families (`llama`, `qwen`, `qwen_vl`, `deepseek`, `gpt_oss`) use HF-backed config/model lookup paths.

Flag mapping:

- `--hf_token` sets `HF_TOKEN` and `TRANSFORMERS_OFFLINE=0`.
- `--offline` sets `HF_HUB_OFFLINE=1`.
- `--hf_token` and `--offline` are mutually exclusive.

Practical guidance:

1. Prefetch required model/tokenizer/config files into a local HF cache.
2. Mount that cache into the container with `-cm/--custom_mounts`.
3. Set `HF_HOME` to that mounted cache path before launch (Slurm exports env vars by default), for example `export HF_HOME=/path/to/hf_cache`.
4. If needed, explicitly override `HF_HOME` with `-ce/--custom_env_vars`.
5. Pass `--offline` to block Hub network checks.

Mounting cached files is not enough by itself. If `HF_HUB_OFFLINE` remains `0`, Hub-backed code paths may still perform network checks and hit HuggingFace rate limits.

##### Parallelism arguments

Expand All @@ -146,12 +181,17 @@ python scripts/performance/setup_experiment.py
- `-ep/--expert_model_parallel_size`: MoE expert parallel degree. Distributes MoE experts across sub data parallel dimension.
- `-et/--expert_tensor_parallel_size`: Expert tensor parallel degree. Intra-layer tensor model parallelism for expert layer. Use `-et` (no value) for `None` or `-et <int>`.

##### Slurm launcher behavior

- The launcher always adds `--container-writable` to `srun`.
- This avoids benchmark failures on clusters using Enroot defaults, where `ENROOT_ROOTFS_WRITABLE=no`.

##### Slurm arguments

- `-a/--account`: Slurm account to use for experiment.
- `-p/--partition`: Slurm partition to use for experiment.
- `-t/--time_limit`: Maximum time limit before the Slurm job is cancelled. Format `HH:MM:SS`. Default `00:30:00`.
- `-gn/--gpus_per_node`: GPUs per node. Default `None`. If not provided, will be inferred from the GPU type.
- `-gn/--gpus_per_node`: GPUs per node. Default `None`. If not provided, it is inferred from the GPU type.
- `-cm/--custom_mounts`: Comma-separated list of host mounts to expose inside the container.
- `-ce/--custom_env_vars`: Comma-separated string of environment variables (format: `key1=value1,key2=value2`).
- `-cs/--custom_srun_args`: Comma-separated string of srun arguments.
Expand Down
8 changes: 7 additions & 1 deletion scripts/performance/argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,12 +318,18 @@ def parse_cli_args():
"--tokenizer_model", type=str, help="Path to tokenizer model (automatically provided by launcher)"
)
tokenizer_args.add_argument("--vocab_size", type=int, default=32000, help="Vocabulary size for NullTokenizer")
tokenizer_args.add_argument(
hf_mode = tokenizer_args.add_mutually_exclusive_group()
hf_mode.add_argument(
"-hf",
"--hf_token",
type=str,
help="HuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.",
)
hf_mode.add_argument(
"--offline",
action="store_true",
help="Enable offline HuggingFace Hub mode by setting HF_HUB_OFFLINE=1.",
)

# Parallelism
parallelism_args = parser.add_argument_group("Parallelism arguments")
Expand Down
9 changes: 8 additions & 1 deletion scripts/performance/setup_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ def main(
compute_dtype: str,
gpu: str,
hf_token: str,
offline: bool,
detach: bool,
dryrun: bool,
enable_vboost: bool,
Expand Down Expand Up @@ -253,7 +254,11 @@ def main(
]
and task == "pretrain"
):
assert hf_token is not None, "HF token is required for Qwen3 tokenizer. NullTokenizer to be used soon."
assert hf_token or offline, (
"Qwen3 tokenizer requires --hf_token (online) or --offline (with a pre-populated local HF cache). "
"For --offline, pre-download the tokenizer with `huggingface-cli download` and ensure HF_HOME points "
"to the cache directory. NullTokenizer to be used soon."
)

if wandb_key is not None:
assert wandb_project_name is not None and wandb_experiment_name is not None, (
Expand Down Expand Up @@ -326,6 +331,7 @@ def main(
custom_bash_cmds=custom_bash_cmds,
gres=gres,
hf_token=hf_token,
offline=offline,
nemo_home=nemo_home,
additional_slurm_params=additional_slurm_params,
wandb_key=wandb_key,
Expand Down Expand Up @@ -592,6 +598,7 @@ def main(
compute_dtype=args.compute_dtype,
gpu=args.gpu,
hf_token=args.hf_token,
offline=args.offline,
detach=args.detach,
dryrun=args.dryrun,
enable_vboost=args.enable_vboost,
Expand Down
26 changes: 17 additions & 9 deletions scripts/performance/utils/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@

PERF_ENV_VARS = {
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
"TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace
"TRANSFORMERS_OFFLINE": "1", # Default for benchmark runs that mostly use NullTokenizer.
"TOKENIZERS_PARALLELISM": "False", # Restrict warning message prints
"NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory
"NVTE_NORM_FWD_USE_CUDNN": "1",
"NVTE_NORM_BWD_USE_CUDNN": "1",
"TORCH_NCCL_HIGH_PRIORITY": "1",
"HF_HUB_OFFLINE": "0",
"HF_HUB_OFFLINE": "0", # Keep HF Hub online by default; --offline flips this to 1.
}


Expand All @@ -61,6 +61,7 @@ def slurm_executor(
custom_env_vars: Dict[str, str] = {},
custom_srun_args: List[str] = [],
hf_token: str = None,
offline: bool = False,
nemo_home: str = DEFAULT_NEMO_HOME,
wandb_key: str = None,
network: str = None,
Expand All @@ -86,6 +87,7 @@ def slurm_executor(
srun_args = custom_srun_args.copy() + [
"--mpi=pmix",
"--no-container-mount-home",
"--container-writable", # Required on clusters using Enroot defaults, where ENROOT_ROOTFS_WRITABLE=no.
]

if log_dir is not None:
Expand All @@ -96,20 +98,26 @@ def slurm_executor(
f"Logs will be written to {get_nemorun_home()}, which is probably not desired. export NEMORUN_HOME in your shell environment or use the --log_dir argument"
)

perf_env = PERF_ENV_VARS.copy()

if wandb_key is not None:
PERF_ENV_VARS["WANDB_API_KEY"] = wandb_key
perf_env["WANDB_API_KEY"] = wandb_key

if gpu.lower() == "gb200":
PERF_ENV_VARS["NCCL_NET_GDR_LEVEL"] = "PHB" # For NCCL 2.25
PERF_ENV_VARS["NCCL_NET_GDR_C2C"] = "1" # For NCCL 2.26
perf_env["NCCL_NET_GDR_LEVEL"] = "PHB" # For NCCL 2.25
perf_env["NCCL_NET_GDR_C2C"] = "1" # For NCCL 2.26

if nemo_home != DEFAULT_NEMO_CACHE_HOME: # DO NOT change this to 'DEFAULT_NEMO_HOME'/'NEMO_HOME'
PERF_ENV_VARS["NEMO_HOME"] = nemo_home
perf_env["NEMO_HOME"] = nemo_home
mounts.extend([f"{nemo_home}:{nemo_home}"])
if hf_token is not None:
PERF_ENV_VARS.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"})
# Enable authenticated online access for tokenizer/config paths.
perf_env.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"})
if offline:
# Disable HF Hub network calls. Requires a pre-populated local HF cache.
perf_env["HF_HUB_OFFLINE"] = "1"

PERF_ENV_VARS.update(custom_env_vars)
perf_env.update(custom_env_vars)
mounts.extend(custom_mounts)

# add --segment flag to sbatch if job uses GB200.
Expand Down Expand Up @@ -143,7 +151,7 @@ def slurm_executor(
gres=gres,
container_image=container_image,
container_mounts=mounts,
env_vars=PERF_ENV_VARS,
env_vars=perf_env,
srun_args=srun_args,
time=time_limit,
mem="0",
Expand Down
203 changes: 203 additions & 0 deletions tests/unit_tests/scripts/test_performance_offline_mode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import sys
import types
from types import SimpleNamespace
from unittest.mock import patch

import pytest


def _package_module(name: str) -> types.ModuleType:
"""Create a minimal package-like module."""
module = types.ModuleType(name)
module.__path__ = []
return module


@pytest.fixture
def import_performance_module():
    """Yield an importer that loads performance modules against a stubbed nemo_run.

    Only the small surface of ``nemo_run`` these tests touch is faked:
    executor/packager constructors just echo their kwargs back as a
    ``SimpleNamespace`` so assertions can inspect what was passed.
    """
    root = _package_module("nemo_run")
    config_mod = types.ModuleType("nemo_run.config")
    core_pkg = _package_module("nemo_run.core")
    execution_pkg = _package_module("nemo_run.core.execution")
    launcher_mod = types.ModuleType("nemo_run.core.execution.launcher")

    # Constructors that record their kwargs instead of doing real work.
    for ctor_name in ("LocalTunnel", "GitArchivePackager", "SlurmExecutor", "DGXCloudExecutor"):
        setattr(root, ctor_name, lambda **kwargs: SimpleNamespace(**kwargs))
    launcher_mod.SlurmTemplate = lambda **kwargs: SimpleNamespace(**kwargs)

    config_mod.get_nemorun_home = lambda: "/tmp/nemorun"
    config_mod.set_nemorun_home = lambda _path: None

    # Wire the dotted hierarchy together so attribute access mirrors imports.
    root.config = config_mod
    root.core = core_pkg
    core_pkg.execution = execution_pkg
    execution_pkg.launcher = launcher_mod

    stubs = {
        "nemo_run": root,
        "nemo_run.config": config_mod,
        "nemo_run.core": core_pkg,
        "nemo_run.core.execution": execution_pkg,
        "nemo_run.core.execution.launcher": launcher_mod,
    }

    with patch.dict(sys.modules, stubs):

        def _fresh_import(module_name: str):
            # Drop any cached copy so the module re-binds against the stubs.
            sys.modules.pop(module_name, None)
            return importlib.import_module(module_name)

        yield _fresh_import


def test_parse_cli_args_accepts_offline_flag(import_performance_module):
    """The performance CLI should keep exposing the offline switch."""
    cli_module = import_performance_module("scripts.performance.argument_parser")
    cli = cli_module.parse_cli_args()

    argv = [
        "--model_family_name", "llama",
        "--model_recipe_name", "llama3_8b",
        "--num_gpus", "8",
        "--gpu", "h100",
        "--offline",
    ]
    parsed = cli.parse_args(argv)

    assert parsed.offline is True


def test_argparse_rejects_hf_token_with_offline(import_performance_module):
    """argparse should reject --hf_token and --offline together at parse time."""
    cli_module = import_performance_module("scripts.performance.argument_parser")
    cli = cli_module.parse_cli_args()

    conflicting_argv = [
        "--model_family_name", "llama",
        "--model_recipe_name", "llama3_8b",
        "--num_gpus", "8",
        "--gpu", "h100",
        "--hf_token", "hf_test_token",
        "--offline",
    ]

    # The mutually exclusive group makes argparse error out, which raises SystemExit.
    with pytest.raises(SystemExit):
        cli.parse_args(conflicting_argv)


def test_slurm_executor_sets_offline_env_and_container_writable(import_performance_module):
    """Offline mode should set HF_HUB_OFFLINE and preserve the offline Transformers default."""
    executors_module = import_performance_module("scripts.performance.utils.executors")

    built = executors_module.slurm_executor(
        gpu="h100",
        account="test_account",
        partition="test_partition",
        log_dir="/tmp/log_dir",
        nodes=1,
        num_gpus_per_node=8,
        offline=True,
    )

    env = built.env_vars
    assert "--container-writable" in built.srun_args
    assert env["HF_HUB_OFFLINE"] == "1"
    assert env["TRANSFORMERS_OFFLINE"] == "1"


def test_slurm_executor_default_has_container_writable_and_hub_online(import_performance_module):
    """By default, --container-writable is always set and HF Hub access stays online."""
    executors_module = import_performance_module("scripts.performance.utils.executors")

    built = executors_module.slurm_executor(
        gpu="h100",
        account="test_account",
        partition="test_partition",
        log_dir="/tmp/log_dir",
        nodes=1,
        num_gpus_per_node=8,
    )

    env = built.env_vars
    assert "--container-writable" in built.srun_args
    assert env["HF_HUB_OFFLINE"] == "0"
    assert env["TRANSFORMERS_OFFLINE"] == "1"


def test_slurm_executor_hf_token_enables_online_transformers(import_performance_module):
    """Providing an HF token should enable the online Transformers path."""
    executors_module = import_performance_module("scripts.performance.utils.executors")

    built = executors_module.slurm_executor(
        gpu="h100",
        account="test_account",
        partition="test_partition",
        log_dir="/tmp/log_dir",
        nodes=1,
        num_gpus_per_node=8,
        hf_token="hf_test_token",
    )

    env = built.env_vars
    assert env["HF_TOKEN"] == "hf_test_token"
    assert env["TRANSFORMERS_OFFLINE"] == "0"
    assert env["HF_HUB_OFFLINE"] == "0"


def test_slurm_executor_no_state_leakage_between_calls(import_performance_module):
    """Calling slurm_executor twice must not leak env vars from the first call into the second."""
    executors_module = import_performance_module("scripts.performance.utils.executors")

    common_kwargs = dict(
        gpu="h100",
        account="test_account",
        partition="test_partition",
        log_dir="/tmp/log_dir",
        nodes=1,
        num_gpus_per_node=8,
    )

    # First call injects secrets via hf_token and wandb_key.
    with_secrets = executors_module.slurm_executor(
        hf_token="hf_secret",
        wandb_key="wandb_secret",
        **common_kwargs,
    )
    assert with_secrets.env_vars["HF_TOKEN"] == "hf_secret"
    assert with_secrets.env_vars["WANDB_API_KEY"] == "wandb_secret"

    # Second call passes neither secret and must see pristine defaults.
    plain = executors_module.slurm_executor(**common_kwargs)
    assert "HF_TOKEN" not in plain.env_vars
    assert "WANDB_API_KEY" not in plain.env_vars
    assert plain.env_vars["TRANSFORMERS_OFFLINE"] == "1"
Loading