From 2dec725bc0f9a84b58220fd861ad53da1f8d9fb5 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Tue, 3 Mar 2026 10:20:09 -0800 Subject: [PATCH 1/3] Add tests exposing os.getlogin() crash in ns setup (#1269) os.getlogin() raises OSError in containers/VMs without a login session. Two call sites in setup.py are affected: the default mount path (local config) and the default SSH username (slurm config). Signed-off-by: George Armstrong --- tests/test_setup.py | 130 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 tests/test_setup.py diff --git a/tests/test_setup.py b/tests/test_setup.py new file mode 100644 index 0000000000..dc1c31f179 --- /dev/null +++ b/tests/test_setup.py @@ -0,0 +1,130 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +from unittest.mock import patch + +import yaml +from typer.testing import CliRunner + +from nemo_skills.pipeline.cli import app + +runner = CliRunner() + + +def test_local_setup_crashes_without_login_session(): + """ns setup crashes in containers/VMs when creating a local config. + + os.getlogin() is called on line 106 to compute the default mount path. + In environments without a login session (containers, VMs, systemd + services), this raises OSError before the user can provide input. + + See: https://github.com/NVIDIA-NeMo/Skills/issues/1269 + """ + with tempfile.TemporaryDirectory() as tmpdir: + with patch("os.getlogin", side_effect=OSError("[Errno 6] No such device or address")): + prompts = "\n".join( + [ + tmpdir, # config dir + "local", # config type + "local", # config name + "/tmp:/workspace", # mounts (never reached — default eval crashes) + "", # HF_HOME + "", # env vars + "n", # pull containers + "n", # create another + ] + ) + result = runner.invoke(app, ["setup"], input=prompts) + + assert result.exit_code != 0, ( + f"Expected crash from os.getlogin() OSError, but exited with code {result.exit_code}.\n" + f"Output: {result.output}" + ) + assert isinstance(result.exception, OSError), ( + f"Expected OSError, got {type(result.exception).__name__}: {result.exception}" + ) + + +def test_slurm_setup_crashes_without_login_session(): + """ns setup crashes in containers/VMs when creating a slurm config with SSH. + + os.getlogin() is called on line 175 as the default SSH username. + The mounts prompt is unaffected (default is None for slurm), but + the SSH username prompt crashes. + + See: https://github.com/NVIDIA-NeMo/Skills/issues/1269 + """ + with tempfile.TemporaryDirectory() as tmpdir: + with patch("os.getlogin", side_effect=OSError("[Errno 6] No such device or address")): + prompts = "\n".join( + [ + tmpdir, # config dir + "slurm", # config type + "slurm", # config name + "/lustre:/lustre", # mounts (OK — default is None for slurm) + "", # HF_HOME + "", # env vars + "y", # SSH access + "cluster.example.com", # SSH hostname + "testuser", # SSH username (never reached — default eval crashes) + "", # SSH key + "/tmp/jobs", # job dir + "myaccount", # account + "batch", # partition + "", # timeouts + "n", # create another + ] + ) + result = runner.invoke(app, ["setup"], input=prompts) + + assert result.exit_code != 0, ( + f"Expected crash from os.getlogin() OSError, but exited with code {result.exit_code}.\n" + f"Output: {result.output}" + ) + assert isinstance(result.exception, OSError), ( + f"Expected OSError, got {type(result.exception).__name__}: {result.exception}" + ) + + +def test_local_setup_succeeds_with_login_session(): + """Sanity check: ns setup works when os.getlogin() is available.""" + with tempfile.TemporaryDirectory() as tmpdir: + with patch("os.getlogin", return_value="testuser"): + prompts = "\n".join( + [ + tmpdir, # config dir + "local", # config type + "local", # config name + "/tmp:/workspace", # mounts + "", # HF_HOME (skip) + "", # env vars + "n", # pull containers + "n", # create another + ] + ) + result = runner.invoke(app, ["setup"], input=prompts) + + assert result.exit_code == 0, ( + f"Expected success, but exited with code {result.exit_code}.\n" + f"Output: {result.output}\nException: {result.exception}" + ) + + config_file = os.path.join(tmpdir, "local.yaml") + assert os.path.exists(config_file) + with open(config_file) as f: + config = yaml.safe_load(f) + assert config["executor"] == "local" + assert "/tmp:/workspace" in config["mounts"] From 13e543d448adfb32616c80c0c83eee0083c73b98 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Tue, 3 Mar 2026 10:21:07 -0800 Subject: [PATCH 2/3] Fix os.getlogin() crash in ns setup for containers/VMs (#1269) Replace os.getlogin() with robust alternatives that work in environments without a login session: - Line 107: os.path.expanduser('~') for the default mount path - Line 176: getpass.getuser() for the default SSH username Signed-off-by: George Armstrong --- nemo_skills/pipeline/setup.py | 5 +-- tests/test_setup.py | 60 +++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/nemo_skills/pipeline/setup.py b/nemo_skills/pipeline/setup.py index c767a9ce5a..c9ac5e2d5e 100644 --- a/nemo_skills/pipeline/setup.py +++ b/nemo_skills/pipeline/setup.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import getpass import os import subprocess from pathlib import Path @@ -103,7 +104,7 @@ def setup(): "as well as for your models (e.g. /trt_models, /hf_models).\n" "If you're setting up a Slurm config, make sure to use the cluster paths here.\n" "What mounts would you like to add? (comma separated)", - default=f"/home/{os.getlogin()}:/workspace" if config_type == "local" else None, + default=f"{os.path.expanduser('~')}:/workspace" if config_type == "local" else None, ) # parse mounts early so we can validate HF_HOME against them @@ -172,7 +173,7 @@ def setup(): ssh_tunnel["host"] = typer.prompt("\nWhat is the ssh hostname of your cluster?") ssh_tunnel["user"] = typer.prompt( "\nWhat is your ssh username on the cluster?", - default=os.getlogin(), + default=getpass.getuser(), ) default_key = os.path.expanduser("~/.ssh/id_rsa") ssh_tunnel["identity"] = typer.prompt( diff --git a/tests/test_setup.py b/tests/test_setup.py index dc1c31f179..e340660c6a 100644 --- a/tests/test_setup.py +++ b/tests/test_setup.py @@ -24,14 +24,12 @@ runner = CliRunner() -def test_local_setup_crashes_without_login_session(): - """ns setup crashes in containers/VMs when creating a local config. +def test_local_setup_without_login_session(): + """ns setup should work in containers/VMs even when os.getlogin() fails. - os.getlogin() is called on line 106 to compute the default mount path. - In environments without a login session (containers, VMs, systemd - services), this raises OSError before the user can provide input. - - See: https://github.com/NVIDIA-NeMo/Skills/issues/1269 + Regression test for https://github.com/NVIDIA-NeMo/Skills/issues/1269 + The default mount path now uses os.path.expanduser('~') instead of + os.getlogin(), so it works in environments without a login session. """ with tempfile.TemporaryDirectory() as tmpdir: with patch("os.getlogin", side_effect=OSError("[Errno 6] No such device or address")): @@ -40,7 +38,7 @@ def test_local_setup_crashes_without_login_session(): tmpdir, # config dir "local", # config type "local", # config name - "/tmp:/workspace", # mounts (never reached — default eval crashes) + "/tmp:/workspace", # mounts "", # HF_HOME "", # env vars "n", # pull containers @@ -49,23 +47,25 @@ def test_local_setup_crashes_without_login_session(): ) result = runner.invoke(app, ["setup"], input=prompts) - assert result.exit_code != 0, ( - f"Expected crash from os.getlogin() OSError, but exited with code {result.exit_code}.\n" - f"Output: {result.output}" - ) - assert isinstance(result.exception, OSError), ( - f"Expected OSError, got {type(result.exception).__name__}: {result.exception}" + assert result.exit_code == 0, ( + f"Expected setup to succeed without os.getlogin(), but exited with code {result.exit_code}.\n" + f"Output: {result.output}\nException: {result.exception}" ) + config_file = os.path.join(tmpdir, "local.yaml") + assert os.path.exists(config_file) + with open(config_file) as f: + config = yaml.safe_load(f) + assert config["executor"] == "local" + assert "/tmp:/workspace" in config["mounts"] -def test_slurm_setup_crashes_without_login_session(): - """ns setup crashes in containers/VMs when creating a slurm config with SSH. - os.getlogin() is called on line 175 as the default SSH username. - The mounts prompt is unaffected (default is None for slurm), but - the SSH username prompt crashes. +def test_slurm_setup_without_login_session(): + """ns setup should work for slurm+SSH even when os.getlogin() fails. - See: https://github.com/NVIDIA-NeMo/Skills/issues/1269 + Regression test for https://github.com/NVIDIA-NeMo/Skills/issues/1269 + The default SSH username now uses getpass.getuser() instead of + os.getlogin(), so it works in environments without a login session. """ with tempfile.TemporaryDirectory() as tmpdir: with patch("os.getlogin", side_effect=OSError("[Errno 6] No such device or address")): @@ -74,12 +74,12 @@ def test_slurm_setup_crashes_without_login_session(): tmpdir, # config dir "slurm", # config type "slurm", # config name - "/lustre:/lustre", # mounts (OK — default is None for slurm) + "/lustre:/lustre", # mounts "", # HF_HOME "", # env vars "y", # SSH access "cluster.example.com", # SSH hostname - "testuser", # SSH username (never reached — default eval crashes) + "testuser", # SSH username "", # SSH key "/tmp/jobs", # job dir "myaccount", # account @@ -90,14 +90,18 @@ def test_slurm_setup_crashes_without_login_session(): ) result = runner.invoke(app, ["setup"], input=prompts) - assert result.exit_code != 0, ( - f"Expected crash from os.getlogin() OSError, but exited with code {result.exit_code}.\n" - f"Output: {result.output}" - ) - assert isinstance(result.exception, OSError), ( - f"Expected OSError, got {type(result.exception).__name__}: {result.exception}" + assert result.exit_code == 0, ( + f"Expected setup to succeed without os.getlogin(), but exited with code {result.exit_code}.\n" + f"Output: {result.output}\nException: {result.exception}" ) + config_file = os.path.join(tmpdir, "slurm.yaml") + assert os.path.exists(config_file) + with open(config_file) as f: + config = yaml.safe_load(f) + assert config["executor"] == "slurm" + assert config["ssh_tunnel"]["user"] == "testuser" + def test_local_setup_succeeds_with_login_session(): """Sanity check: ns setup works when os.getlogin() is available.""" From c364d71309edb9d2f18f596102fda9a56957467f Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Tue, 3 Mar 2026 10:44:01 -0800 Subject: [PATCH 3/3] Remove overbuilt test_setup.py tests Signed-off-by: George Armstrong --- tests/test_setup.py | 134 -------------------------------------------- 1 file changed, 134 deletions(-) delete mode 100644 tests/test_setup.py diff --git a/tests/test_setup.py b/tests/test_setup.py deleted file mode 100644 index e340660c6a..0000000000 --- a/tests/test_setup.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from unittest.mock import patch - -import yaml -from typer.testing import CliRunner - -from nemo_skills.pipeline.cli import app - -runner = CliRunner() - - -def test_local_setup_without_login_session(): - """ns setup should work in containers/VMs even when os.getlogin() fails. - - Regression test for https://github.com/NVIDIA-NeMo/Skills/issues/1269 - The default mount path now uses os.path.expanduser('~') instead of - os.getlogin(), so it works in environments without a login session. - """ - with tempfile.TemporaryDirectory() as tmpdir: - with patch("os.getlogin", side_effect=OSError("[Errno 6] No such device or address")): - prompts = "\n".join( - [ - tmpdir, # config dir - "local", # config type - "local", # config name - "/tmp:/workspace", # mounts - "", # HF_HOME - "", # env vars - "n", # pull containers - "n", # create another - ] - ) - result = runner.invoke(app, ["setup"], input=prompts) - - assert result.exit_code == 0, ( - f"Expected setup to succeed without os.getlogin(), but exited with code {result.exit_code}.\n" - f"Output: {result.output}\nException: {result.exception}" - ) - - config_file = os.path.join(tmpdir, "local.yaml") - assert os.path.exists(config_file) - with open(config_file) as f: - config = yaml.safe_load(f) - assert config["executor"] == "local" - assert "/tmp:/workspace" in config["mounts"] - - -def test_slurm_setup_without_login_session(): - """ns setup should work for slurm+SSH even when os.getlogin() fails. - - Regression test for https://github.com/NVIDIA-NeMo/Skills/issues/1269 - The default SSH username now uses getpass.getuser() instead of - os.getlogin(), so it works in environments without a login session. - """ - with tempfile.TemporaryDirectory() as tmpdir: - with patch("os.getlogin", side_effect=OSError("[Errno 6] No such device or address")): - prompts = "\n".join( - [ - tmpdir, # config dir - "slurm", # config type - "slurm", # config name - "/lustre:/lustre", # mounts - "", # HF_HOME - "", # env vars - "y", # SSH access - "cluster.example.com", # SSH hostname - "testuser", # SSH username - "", # SSH key - "/tmp/jobs", # job dir - "myaccount", # account - "batch", # partition - "", # timeouts - "n", # create another - ] - ) - result = runner.invoke(app, ["setup"], input=prompts) - - assert result.exit_code == 0, ( - f"Expected setup to succeed without os.getlogin(), but exited with code {result.exit_code}.\n" - f"Output: {result.output}\nException: {result.exception}" - ) - - config_file = os.path.join(tmpdir, "slurm.yaml") - assert os.path.exists(config_file) - with open(config_file) as f: - config = yaml.safe_load(f) - assert config["executor"] == "slurm" - assert config["ssh_tunnel"]["user"] == "testuser" - - -def test_local_setup_succeeds_with_login_session(): - """Sanity check: ns setup works when os.getlogin() is available.""" - with tempfile.TemporaryDirectory() as tmpdir: - with patch("os.getlogin", return_value="testuser"): - prompts = "\n".join( - [ - tmpdir, # config dir - "local", # config type - "local", # config name - "/tmp:/workspace", # mounts - "", # HF_HOME (skip) - "", # env vars - "n", # pull containers - "n", # create another - ] - ) - result = runner.invoke(app, ["setup"], input=prompts) - - assert result.exit_code == 0, ( - f"Expected success, but exited with code {result.exit_code}.\n" - f"Output: {result.output}\nException: {result.exception}" - ) - - config_file = os.path.join(tmpdir, "local.yaml") - assert os.path.exists(config_file) - with open(config_file) as f: - config = yaml.safe_load(f) - assert config["executor"] == "local" - assert "/tmp:/workspace" in config["mounts"]