Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions scripts/performance/utils/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@

PERF_ENV_VARS = {
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
"TRANSFORMERS_OFFLINE": "1", # Enable online downloads from HuggingFace
"TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace
"TOKENIZERS_PARALLELISM": "False", # Restrict warning message prints
"NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory
"NVTE_NORM_FWD_USE_CUDNN": "1",
"NVTE_NORM_BWD_USE_CUDNN": "1",
"TORCH_NCCL_HIGH_PRIORITY": "1",
"HF_HUB_OFFLINE": "0",
"HF_HUB_OFFLINE": "1",
}


Expand Down Expand Up @@ -107,7 +107,9 @@ def slurm_executor(
PERF_ENV_VARS["NEMO_HOME"] = nemo_home
mounts.extend([f"{nemo_home}:{nemo_home}"])
if hf_token is not None:
PERF_ENV_VARS.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"})
PERF_ENV_VARS["HF_TOKEN"] = hf_token
PERF_ENV_VARS["TRANSFORMERS_OFFLINE"] = "0"
PERF_ENV_VARS["HF_HUB_OFFLINE"] = "0"
Comment on lines 109 to +112

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Avoid global env var leakage across calls.
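Mutating the module-level PERF_ENV_VARS dict in place makes HF_TOKEN and the HF_HUB_OFFLINE/TRANSFORMERS_OFFLINE overrides “sticky”: once one call passes hf_token, every later call in the same process (including calls from other threads) inherits the token and the online-mode settings, even when it passes no token itself. Use a per-call copy instead.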

✅ Proposed fix (use a local copy)
 def slurm_executor(
@@
 ) -> run.SlurmExecutor:
@@
-    if wandb_key is not None:
-        PERF_ENV_VARS["WANDB_API_KEY"] = wandb_key
+    env_vars = PERF_ENV_VARS.copy()
+    if wandb_key is not None:
+        env_vars["WANDB_API_KEY"] = wandb_key
@@
-        PERF_ENV_VARS["NCCL_NET_GDR_LEVEL"] = "PHB"  # For NCCL 2.25
-        PERF_ENV_VARS["NCCL_NET_GDR_C2C"] = "1"  # For NCCL 2.26
+        env_vars["NCCL_NET_GDR_LEVEL"] = "PHB"  # For NCCL 2.25
+        env_vars["NCCL_NET_GDR_C2C"] = "1"  # For NCCL 2.26
@@
-        PERF_ENV_VARS["NEMO_HOME"] = nemo_home
+        env_vars["NEMO_HOME"] = nemo_home
@@
-        PERF_ENV_VARS["HF_TOKEN"] = hf_token
-        PERF_ENV_VARS["TRANSFORMERS_OFFLINE"] = "0"
-        PERF_ENV_VARS["HF_HUB_OFFLINE"] = "0"
+        env_vars["HF_TOKEN"] = hf_token
+        env_vars["TRANSFORMERS_OFFLINE"] = "0"
+        env_vars["HF_HUB_OFFLINE"] = "0"
@@
-    PERF_ENV_VARS.update(custom_env_vars)
+    env_vars.update(custom_env_vars)
@@
-        env_vars=PERF_ENV_VARS,
+        env_vars=env_vars,
🤖 Prompt for AI Agents
In `@scripts/performance/utils/executors.py` around lines 109 - 112, slurm_executor
mutates the module-level PERF_ENV_VARS dict in place when hf_token is provided,
so HF_TOKEN and the HF_HUB_OFFLINE/TRANSFORMERS_OFFLINE overrides persist across
calls/threads. Fix by creating a per-call copy at the start of the executor
(e.g., env_vars = PERF_ENV_VARS.copy()), applying every per-call change to that
copy (the hf_token branch, and likewise the wandb_key, NCCL, NEMO_HOME and
custom_env_vars updates), and passing the copy as env_vars= to the executor
instead of PERF_ENV_VARS. Search for PERF_ENV_VARS and hf_token to locate the
affected lines.


PERF_ENV_VARS.update(custom_env_vars)
mounts.extend(custom_mounts)
Expand Down