From b52b78429485365018c1f593095a390e54a6436a Mon Sep 17 00:00:00 2001
From: Robert Hensing <robert@roberthensing.nl>
Date: Mon, 15 Dec 2025 21:14:40 +0100
Subject: [PATCH] nixos/test-driver: warn when command exits but stdout stays
 open

The test driver's succeed() method waits for stdout to be fully
consumed before returning. When a command spawns background processes
that inherit stdout, succeed() will wait silently for those processes
to complete or close stdout. This wait can be arbitrarily long and
users have no visibility into what's causing the delay.

Unfortunately just changing the behavior of these widely used methods
is not an option.

This change detects when a command has exited but stdout remains open
for more than 10 seconds, and emits a warning to help users diagnose
the issue. This warning briefly explains the problem and suggests
redirecting background process output to avoid the implicit wait.

The implementation uses bash coproc to independently track:
- The command process exit status (preserved for return)
- The stdout closure (via base64 process termination)
- A 10-second timeout using wait -n to race these events

When stdout closes quickly: no warning
When stdout stays open >10s: warning emitted, continues waiting
---
 .../src/test_driver/machine/__init__.py       | 82 ++++++++++++++++---
 nixos/tests/all-tests.nix                     | 24 ++++++
 .../succeed-stdout-warning.nix                | 45 ++++++++++
 3 files changed, 140 insertions(+), 11 deletions(-)
 create mode 100644 nixos/tests/nixos-test-driver/succeed-stdout-warning.nix

diff --git a/nixos/lib/test-driver/src/test_driver/machine/__init__.py b/nixos/lib/test-driver/src/test_driver/machine/__init__.py
index 926b743b17b2e..196de618477f9 100644
--- a/nixos/lib/test-driver/src/test_driver/machine/__init__.py
+++ b/nixos/lib/test-driver/src/test_driver/machine/__init__.py
@@ -25,6 +25,59 @@
 from .ocr import perform_ocr_on_screenshot, perform_ocr_variants_on_screenshot
 from .qmp import QMPSession
 
+# Bash function to monitor stdout closure after command exits
+STDOUT_MONITOR_FN = """
+_nixos_test_run_with_monitor() {
+  local timeout_arg="$1"
+  local cmd="$2"
+
+  # Start the user command as a coproc to get its PID and stdout FD
+  if [ -n "$timeout_arg" ]; then
+    coproc cmd_proc { timeout "$timeout_arg" bash -c "$cmd"; }
+  else
+    coproc cmd_proc { bash -c "$cmd"; }
+  fi
+  local cmd_pid=$cmd_proc_PID
+
+  # Duplicate the read FD and close the original coproc FDs
+  exec {dup_read_fd}<&${cmd_proc[0]}
+  exec {cmd_proc[0]}<&-
+  exec {cmd_proc[1]}>&-
+
+  # Start base64 reading from the user command's stdout
+  { base64 -w 0 <&$dup_read_fd; echo; } &
+  local base64_pid=$!
+
+  # Wait for user command to exit and capture its status
+  wait $cmd_pid
+  local cmd_exit_status=$?
+
+  # Now monitor: either base64 finishes (stdout closed) or 10s timeout expires
+  sleep 10 &
+  local timeout_pid=$!
+
+  # Wait for whichever finishes first: base64 or timeout
+  # Use wait -p to definitively know which process finished
+  local finished_pid
+  wait -n -p finished_pid $base64_pid $timeout_pid
+
+  # Check which one finished
+  if [ "$finished_pid" = "$base64_pid" ]; then
+    # base64 finished first (fast path - no warning)
+    kill $timeout_pid 2>/dev/null
+    wait $timeout_pid 2>/dev/null
+  else
+    # Timeout finished first, base64 still running (slow path - emit warning)
+    echo -e "\033[1;31mWARNING:\033[0m Command '$cmd' for the test script has exited, but not closed stdout. This is indicative of a background process that still references stdout and could produce output. This is likely not intentional. Calls such as machine.succeed(), execute() and fail() record the entirety of stdout. Consider closing stdout by redirecting your background process stdout elsewhere. Typically that's >&2 to log to the console, >some/path.log for a file on the machine, or >/dev/null to discard stdout entirely." >&2
+    # Continue waiting for base64 to finish
+    wait $base64_pid
+  fi
+
+  # Return the command's exit status
+  return $cmd_exit_status
+}
+"""
+
 CHAR_TO_KEY = {
     "A": "shift-a",
     "N": "shift-n",
@@ -503,15 +556,10 @@ def execute(
         # Always run command with shell opts
         command = f"set -euo pipefail; {command}"
 
-        timeout_str = ""
-        if timeout is not None:
-            timeout_str = f"timeout {timeout}"
-
-        # While sh is bash on NixOS, this is not the case for every distro.
-        # We explicitly call bash here to allow for the driver to boot other distros as well.
-        out_command = (
-            f"{timeout_str} bash -c {shlex.quote(command)} | (base64 -w 0; echo)\n"
-        )
+        # Use the monitoring function that will warn if stdout stays open
+        # Pass timeout as first arg (empty string if None)
+        timeout_arg = str(timeout) if timeout is not None else ""
+        out_command = f"_nixos_test_run_with_monitor {shlex.quote(timeout_arg)} {shlex.quote(command)}\n"
 
         assert self.shell
         self.shell.send(out_command.encode())
@@ -525,8 +573,8 @@ def execute(
         if not check_return:
             return (-1, output.decode())
 
-        # Get the return code
-        self.shell.send(b"echo ${PIPESTATUS[0]}\n")
+        # Get the return code (the monitor function returns the child command's exit status)
+        self.shell.send(b"echo $?\n")
         rc = int(self._next_newline_closed_block_from_shell().strip())
 
         return (rc, output.decode(errors="replace"))
@@ -790,6 +838,15 @@ def start_job(self, jobname: str, user: str | None = None) -> tuple[int, str]:
     def stop_job(self, jobname: str, user: str | None = None) -> tuple[int, str]:
         return self.systemctl(f"stop {jobname}", user)
 
+    def configure_backdoor_shell(self) -> None:
+        """
+        Configure the backdoor shell with helper functions and settings.
+        Called automatically after initial connection.
+        """
+        assert self.shell
+        # Send function definition with trailing newline to ensure it's complete
+        self.shell.send((STDOUT_MONITOR_FN + "\n").encode())
+
     def connect(self) -> None:
         def shell_ready(timeout_secs: int) -> bool:
             """We sent some data from the backdoor service running on the guest
@@ -831,6 +888,9 @@ def shell_ready(timeout_secs: int) -> bool:
             self.log(f"(connecting took {toc - tic:.2f} seconds)")
             self.connected = True
 
+            # Configure the shell with helper functions
+            self.configure_backdoor_shell()
+
     @contextmanager
     def _managed_screenshot(self) -> Generator[Path]:
         """
diff --git a/nixos/tests/all-tests.nix b/nixos/tests/all-tests.nix
index e8c426682f096..80cc991ac4c29 100644
--- a/nixos/tests/all-tests.nix
+++ b/nixos/tests/all-tests.nix
@@ -181,6 +181,30 @@ in
           [[ 143 = $(cat $failed/testBuildFailure.exit) ]]
           touch $out
         '';
+    succeed-stdout-warning =
+      pkgs.runCommand "ensure-succeed-stdout-warning"
+        {
+          failed = pkgs.testers.testBuildFailure (
+            (runTest ./nixos-test-driver/succeed-stdout-warning.nix).config.rawTestDerivation
+          );
+        }
+        ''
+          # Verify the warning appears for slow command
+          grep -F "(echo ONE; sleep 15; echo TWO) &' for the test script has exited, but not closed stdout" $failed/testBuildFailure.log
+
+          # Verify NO warning appears between fast command markers
+          if sed -n '/MARKER: before fast command/,/MARKER: after fast command/p' $failed/testBuildFailure.log | grep -F "for the test script has exited, but not closed stdout"; then
+            echo "ERROR: Warning appeared for fast command when it should not have"
+            exit 1
+          fi
+
+          # Verify User Command stderr goes to console
+          grep -F "STDERR_TEST_MARKER: user command stderr output" $failed/testBuildFailure.log
+
+          # Verify test completed
+          grep -F "Test script done and successful so far. Intentionally failing so that testBuildFailure can perform remaining log checks" $failed/testBuildFailure.log
+          touch $out
+        '';
   };
 
   # NixOS vm tests and non-vm unit tests
diff --git a/nixos/tests/nixos-test-driver/succeed-stdout-warning.nix b/nixos/tests/nixos-test-driver/succeed-stdout-warning.nix
new file mode 100644
index 0000000000000..5a8a6d5fc9862
--- /dev/null
+++ b/nixos/tests/nixos-test-driver/succeed-stdout-warning.nix
@@ -0,0 +1,45 @@
+# Run with:
+#   cd nixpkgs
+#   nix-build -A nixosTests.nixos-test-driver.succeed-stdout-warning
+{
+  name = "Test that succeed() warns when stdout stays open after command exits";
+
+  nodes = {
+    machine = (
+      { pkgs, ... }:
+      {
+      }
+    );
+  };
+
+  testScript = ''
+    start_all()
+    machine.wait_for_unit("multi-user.target")
+
+    with subtest("Fast command should NOT produce warning"):
+        machine.succeed("echo 'MARKER: before fast command' >&2")
+        output = machine.succeed("echo HELLO")
+        assert output == "HELLO\n", f"Expected 'HELLO\\n', got {output!r}"
+        machine.succeed("echo 'MARKER: after fast command' >&2")
+
+    with subtest("Background process keeps stdout open - should produce warning"):
+        machine.succeed("echo 'MARKER: before slow command' >&2")
+        output = machine.succeed("(echo ONE; sleep 15; echo TWO) &")
+        assert output == "ONE\nTWO\n", f"Expected 'ONE\\nTWO\\n', got {output!r}"
+        machine.succeed("echo 'MARKER: after slow command' >&2")
+
+    with subtest("Exit status should be preserved (non-zero)"):
+        try:
+            machine.succeed("exit 42")
+            assert False, "Expected command to fail with exit code 42"
+        except Exception as e:
+            # Verify the exception message contains the correct exit code
+            assert "exit code 42" in str(e), f"Expected 'exit code 42' in exception, got: {e}"
+
+    with subtest("User Command stderr should go to console"):
+        machine.succeed("echo 'STDERR_TEST_MARKER: user command stderr output' >&2")
+
+    # Fail the test so testBuildFailure can access the log
+    raise Exception("Test script done and successful so far. Intentionally failing so that testBuildFailure can perform remaining log checks")
+  '';
+}