From b52b78429485365018c1f593095a390e54a6436a Mon Sep 17 00:00:00 2001 From: Robert Hensing Date: Mon, 15 Dec 2025 21:14:40 +0100 Subject: [PATCH] nixos/test-driver: warn when command exits but stdout stays open The test driver's succeed() method waits for stdout to be fully consumed before returning. When a command spawns background processes that inherit stdout, succeed() will wait silently for those processes to complete or close stdout. This wait can be arbitrarily long and users have no visibility into what's causing the delay. Unfortunately just changing the behavior of these widely used methods is not an option. This change detects when a command has exited but stdout remains open for more than 10 seconds, and emits a warning to help users diagnose the issue. This warning briefly explains the problem and suggests redirecting background process output to avoid the implicit wait. The implementation uses bash coproc to independently track: - The command process exit status (preserved for return) - The stdout closure (via base64 process termination) - A 10-second timeout using wait -n to race these events When stdout closes quickly: no warning When stdout stays open >10s: warning emitted, continues waiting --- .../src/test_driver/machine/__init__.py | 82 ++++++++++++++++--- nixos/tests/all-tests.nix | 24 ++++++ .../succeed-stdout-warning.nix | 45 ++++++++++ 3 files changed, 140 insertions(+), 11 deletions(-) create mode 100644 nixos/tests/nixos-test-driver/succeed-stdout-warning.nix diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py index 926b743b17b2e..196de618477f9 100644 --- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py +++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py @@ -25,6 +25,59 @@ from .ocr import perform_ocr_on_screenshot, perform_ocr_variants_on_screenshot from .qmp import QMPSession +# Bash function to monitor stdout closure after command exits +STDOUT_MONITOR_FN = """ +_nixos_test_run_with_monitor() { + local timeout_arg="$1" + local cmd="$2" + + # Start the user command as a coproc to get its PID and stdout FD + if [ -n "$timeout_arg" ]; then + coproc cmd_proc { timeout "$timeout_arg" bash -c "$cmd"; } + else + coproc cmd_proc { bash -c "$cmd"; } + fi + local cmd_pid=$cmd_proc_PID + + # Duplicate the read FD and close the original coproc FDs + exec {dup_read_fd}<&${cmd_proc[0]} + exec {cmd_proc[0]}<&- + exec {cmd_proc[1]}>&- + + # Start base64 reading from the user command's stdout + { base64 -w 0 <&$dup_read_fd; echo; } & + local base64_pid=$! + + # Wait for user command to exit and capture its status + wait $cmd_pid + local cmd_exit_status=$? + + # Now monitor: either base64 finishes (stdout closed) or 10s timeout expires + sleep 10 & + local timeout_pid=$! + + # Wait for whichever finishes first: base64 or timeout + # Use wait -p to definitively know which process finished + local finished_pid + wait -n -p finished_pid $base64_pid $timeout_pid + + # Check which one finished + if [ "$finished_pid" = "$base64_pid" ]; then + # base64 finished first (fast path - no warning) + kill $timeout_pid 2>/dev/null + wait $timeout_pid 2>/dev/null + else + # Timeout finished first, base64 still running (slow path - emit warning) + echo -e "\033[1;31mWARNING:\033[0m Command '$cmd' for the test script has exited, but not closed stdout. This is indicative of a background process that still references stdout and could produce output. This is likely not intentional. Calls such as machine.succeed(), execute() and fail() record the entirety of stdout. Consider closing stdout by redirecting your background process stdout elsewhere. Typically that's >&2 to log to the console, >some/path.log for a file on the machine, or >/dev/null to discard stdout entirely." >&2 + # Continue waiting for base64 to finish + wait $base64_pid + fi + + # Return the command's exit status + return $cmd_exit_status +} +""" + CHAR_TO_KEY = { "A": "shift-a", "N": "shift-n", @@ -503,15 +556,10 @@ def execute( # Always run command with shell opts command = f"set -euo pipefail; {command}" - timeout_str = "" - if timeout is not None: - timeout_str = f"timeout {timeout}" - - # While sh is bash on NixOS, this is not the case for every distro. - # We explicitly call bash here to allow for the driver to boot other distros as well. - out_command = ( - f"{timeout_str} bash -c {shlex.quote(command)} | (base64 -w 0; echo)\n" - ) + # Use the monitoring function that will warn if stdout stays open + # Pass timeout as first arg (empty string if None) + timeout_arg = str(timeout) if timeout is not None else "" + out_command = f"_nixos_test_run_with_monitor {shlex.quote(timeout_arg)} {shlex.quote(command)}\n" assert self.shell self.shell.send(out_command.encode()) @@ -525,8 +573,8 @@ def execute( if not check_return: return (-1, output.decode()) - # Get the return code - self.shell.send(b"echo ${PIPESTATUS[0]}\n") + # Get the return code (the monitor function returns the child command's exit status) + self.shell.send(b"echo $?\n") rc = int(self._next_newline_closed_block_from_shell().strip()) return (rc, output.decode(errors="replace")) @@ -790,6 +838,15 @@ def start_job(self, jobname: str, user: str | None = None) -> tuple[int, str]: def stop_job(self, jobname: str, user: str | None = None) -> tuple[int, str]: return self.systemctl(f"stop {jobname}", user) + def configure_backdoor_shell(self) -> None: + """ + Configure the backdoor shell with helper functions and settings. + Called automatically after initial connection. + """ + assert self.shell + # Send function definition with trailing newline to ensure it's complete + self.shell.send((STDOUT_MONITOR_FN + "\n").encode()) + def connect(self) -> None: def shell_ready(timeout_secs: int) -> bool: """We sent some data from the backdoor service running on the guest @@ -831,6 +888,9 @@ def shell_ready(timeout_secs: int) -> bool: self.log(f"(connecting took {toc - tic:.2f} seconds)") self.connected = True + # Configure the shell with helper functions + self.configure_backdoor_shell() + @contextmanager def _managed_screenshot(self) -> Generator[Path]: """ diff --git a/nixos/tests/all-tests.nix b/nixos/tests/all-tests.nix index e8c426682f096..80cc991ac4c29 100644 --- a/nixos/tests/all-tests.nix +++ b/nixos/tests/all-tests.nix @@ -181,6 +181,30 @@ in [[ 143 = $(cat $failed/testBuildFailure.exit) ]] touch $out ''; + succeed-stdout-warning = + pkgs.runCommand "ensure-succeed-stdout-warning" + { + failed = pkgs.testers.testBuildFailure ( + (runTest ./nixos-test-driver/succeed-stdout-warning.nix).config.rawTestDerivation + ); + } + '' + # Verify the warning appears for slow command + grep -F "(echo ONE; sleep 15; echo TWO) &' for the test script has exited, but not closed stdout" $failed/testBuildFailure.log + + # Verify NO warning appears between fast command markers + if sed -n '/MARKER: before fast command/,/MARKER: after fast command/p' $failed/testBuildFailure.log | grep -F "for the test script has exited, but not closed stdout"; then + echo "ERROR: Warning appeared for fast command when it should not have" + exit 1 + fi + + # Verify User Command stderr goes to console + grep -F "STDERR_TEST_MARKER: user command stderr output" $failed/testBuildFailure.log + + # Verify test completed + grep -F "Test script done and successful so far. Intentionally failing so that testBuildFailure can perform remaining log checks" $failed/testBuildFailure.log + touch $out + ''; }; # NixOS vm tests and non-vm unit tests diff --git a/nixos/tests/nixos-test-driver/succeed-stdout-warning.nix b/nixos/tests/nixos-test-driver/succeed-stdout-warning.nix new file mode 100644 index 0000000000000..5a8a6d5fc9862 --- /dev/null +++ b/nixos/tests/nixos-test-driver/succeed-stdout-warning.nix @@ -0,0 +1,45 @@ +# Run with: +# cd nixpkgs +# nix-build -A nixosTests.nixos-test-driver.succeed-stdout-warning +{ + name = "Test that succeed() warns when stdout stays open after command exits"; + + nodes = { + machine = ( + { pkgs, ... }: + { + } + ); + }; + + testScript = '' + start_all() + machine.wait_for_unit("multi-user.target") + + with subtest("Fast command should NOT produce warning"): + machine.succeed("echo 'MARKER: before fast command' >&2") + output = machine.succeed("echo HELLO") + assert output == "HELLO\n", f"Expected 'HELLO\\n', got {output!r}" + machine.succeed("echo 'MARKER: after fast command' >&2") + + with subtest("Background process keeps stdout open - should produce warning"): + machine.succeed("echo 'MARKER: before slow command' >&2") + output = machine.succeed("(echo ONE; sleep 15; echo TWO) &") + assert output == "ONE\nTWO\n", f"Expected 'ONE\\nTWO\\n', got {output!r}" + machine.succeed("echo 'MARKER: after slow command' >&2") + + with subtest("Exit status should be preserved (non-zero)"): + try: + machine.succeed("exit 42") + assert False, "Expected command to fail with exit code 42" + except Exception as e: + # Verify the exception message contains the correct exit code + assert "exit code 42" in str(e), f"Expected 'exit code 42' in exception, got: {e}" + + with subtest("User Command stderr should go to console"): + machine.succeed("echo 'STDERR_TEST_MARKER: user command stderr output' >&2") + + # Fail the test so testBuildFailure can access the log + raise Exception("Test script done and successful so far. Intentionally failing so that testBuildFailure can perform remaining log checks") + ''; +}