From b4d8d5cb17e0e3b64f5304990cc29d8809ec23ec Mon Sep 17 00:00:00 2001
From: Jeremie Dimino <jeremie@dimino.org>
Date: Tue, 12 Oct 2021 13:34:31 +0100
Subject: [PATCH 1/2] Update spawn

Signed-off-by: Jeremie Dimino <jeremie@dimino.org>
---
 vendor/spawn/src/spawn.ml      | 41 +++++++++++++++++++---------------
 vendor/spawn/src/spawn.mli     | 22 ++++++++++++++++++
 vendor/spawn/src/spawn_stubs.c | 23 ++++++++++++++++---
 vendor/update-spawn.sh         |  2 +-
 4 files changed, 66 insertions(+), 22 deletions(-)

diff --git a/vendor/spawn/src/spawn.ml b/vendor/spawn/src/spawn.ml
index 658ccc466c2..cc0e72d405c 100644
--- a/vendor/spawn/src/spawn.ml
+++ b/vendor/spawn/src/spawn.ml
@@ -75,6 +75,21 @@ module Env : Env = (val if Sys.win32 then
                         else
                           (module Env_unix) : Env)
 
+module Pgid = struct
+  type t = int
+
+  let new_process_group = 0
+
+  let of_pid = function
+    | 0 ->
+      raise (Invalid_argument "bad pid: 0 (hint: use [Pgid.new_process_group])")
+    | t ->
+      if t < 0 then
+        raise (Invalid_argument ("bad pid: " ^ string_of_int t))
+      else
+        t
+end
+
 external spawn_unix :
      env:Env.t option
   -> cwd:Working_dir.t
@@ -84,6 +99,7 @@ external spawn_unix :
   -> stdout:Unix.file_descr
   -> stderr:Unix.file_descr
   -> use_vfork:bool
+  -> setpgid:int option
   -> int = "spawn_unix_byte" "spawn_unix"
 
 external spawn_windows :
@@ -96,7 +112,8 @@ external spawn_windows :
   -> stderr:Unix.file_descr
   -> int = "spawn_windows_byte" "spawn_windows"
 
-let spawn_windows ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork:_ =
+let spawn_windows ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork:_
+    ~setpgid:_ =
   let cwd =
     match (cwd : Working_dir.t) with
     | Path p -> Some p
@@ -120,7 +137,7 @@ let no_null s =
 
 let spawn ?env ?(cwd = Working_dir.Inherit) ~prog ~argv ?(stdin = Unix.stdin)
     ?(stdout = Unix.stdout) ?(stderr = Unix.stderr)
-    ?(unix_backend = Unix_backend.default) () =
+    ?(unix_backend = Unix_backend.default) ?setpgid () =
   (match cwd with
   | Path s -> no_null s
   | Fd _
@@ -139,25 +156,13 @@ let spawn ?env ?(cwd = Working_dir.Inherit) ~prog ~argv ?(stdin = Unix.stdin)
     | Vfork -> true
     | Fork -> false
   in
-  backend ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork
+  backend ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork ~setpgid
 
 external safe_pipe : unit -> Unix.file_descr * Unix.file_descr = "spawn_pipe"
 
 let safe_pipe =
-  if Sys.win32 then (
+  if Sys.win32 then
     fun () ->
-  (* CR-someday jdimino: fix race conditions on Windows *)
-  let fdr, fdw = Unix.pipe () in
-  match
-    Unix.set_close_on_exec fdr;
-    Unix.set_close_on_exec fdw
-  with
-  | () -> (fdr, fdw)
-  | exception exn ->
-    (try Unix.close fdr with
-    | _ -> ());
-    (try Unix.close fdw with
-    | _ -> ());
-    raise exn
-  ) else
+  Unix.pipe ~cloexec:true ()
+  else
     safe_pipe
diff --git a/vendor/spawn/src/spawn.mli b/vendor/spawn/src/spawn.mli
index ce9c7240d1c..ca2bc1ddfa2 100644
--- a/vendor/spawn/src/spawn.mli
+++ b/vendor/spawn/src/spawn.mli
@@ -31,6 +31,21 @@ module Env : sig
   val of_list : string list -> t
 end
 
+(** Process group IDs *)
+module Pgid : sig
+  (** Representation of the second parameter to [setpgid]. If a value of this
+      type is provided to [spawn], the child will immediately set its pgid
+      accordingly. *)
+  type t
+
+  (** Sets the child's pgid to the same as its process id. Equivalent to calling
+      [setpgid(0, 0)]. *)
+  val new_process_group : t
+
+  (** Raises [Invalid_arg] if the value is not strictly positive. *)
+  val of_pid : int -> t
+end
+
 (** Spawn a sub-command and return its PID. This function is low-level and
     should be used to build higher-level APIs.
 
@@ -73,6 +88,12 @@ end
     input, output and error output of the sub-process. When not specified, they
     default to the ones from the calling process.
 
+    {b Process groups}
+
+    If [setpgid] is provided, the child will immediately call [setpgid(0,pid)],
+    where [pid] is a [pid_t] determined from the [Pgid.t] given (see that
+    module). This parameter has no effect on Windows platforms.
+
     {b Signals}
 
     On Unix, the sub-process will have all its signals unblocked.
@@ -91,6 +112,7 @@ val spawn :
   -> ?stdout:Unix.file_descr
   -> ?stderr:Unix.file_descr
   -> ?unix_backend:Unix_backend.t (* default: [Unix_backend.default] *)
+  -> ?setpgid:Pgid.t
   -> unit
   -> int
 
diff --git a/vendor/spawn/src/spawn_stubs.c b/vendor/spawn/src/spawn_stubs.c
index aebe124e1b5..eabf6f6af7a 100644
--- a/vendor/spawn/src/spawn_stubs.c
+++ b/vendor/spawn/src/spawn_stubs.c
@@ -208,6 +208,8 @@ struct spawn_info {
   char *prog;
   char **argv;
   int std_fds[3];
+  int set_pgid;
+  pid_t pgid;
 };
 
 static void subprocess(int failure_fd, struct spawn_info *info)
@@ -216,6 +218,13 @@ static void subprocess(int failure_fd, struct spawn_info *info)
   struct sigaction sa;
   sigset_t sigset;
 
+  if (info->set_pgid) {
+    if (setpgid(0, info->pgid) == -1) {
+      subprocess_failure(failure_fd, "setpgid", NOTHING);
+      return;
+    }
+  }
+
   /* Restore all signals to their default behavior before unblocking
      them, to avoid invoking handlers from the parent */
   sa.sa_handler = SIG_DFL;
@@ -349,7 +358,8 @@ CAMLprim value spawn_unix(value v_env,
                           value v_stdin,
                           value v_stdout,
                           value v_stderr,
-                          value v_use_vfork)
+                          value v_use_vfork,
+                          value v_setpgid)
 {
   CAMLparam4(v_env, v_cwd, v_prog, v_argv);
   pid_t ret;
@@ -394,6 +404,10 @@ CAMLprim value spawn_unix(value v_env,
   info.env =
     Is_block(v_env) ?
     alloc_string_vect(Field(v_env, 0)) : copy_c_string_array(environ);
+  info.set_pgid = Is_block(v_setpgid);
+  info.pgid =
+    Is_block(v_setpgid) ?
+    Long_val(Field(v_setpgid, 0)) : 0;
 
   caml_enter_blocking_section();
   enter_safe_pipe_section();
@@ -508,7 +522,8 @@ CAMLprim value spawn_unix(value v_env,
                           value v_stdin,
                           value v_stdout,
                           value v_stderr,
-                          value v_use_vfork)
+                          value v_use_vfork,
+                          value v_setpgid)
 {
   (void)v_env;
   (void)v_cwd;
@@ -518,6 +533,7 @@ CAMLprim value spawn_unix(value v_env,
   (void)v_stdout;
   (void)v_stderr;
   (void)v_use_vfork;
+  (void)v_setpgid;
   unix_error(ENOSYS, "spawn_unix", Nothing);
 }
 
@@ -598,7 +614,8 @@ CAMLprim value spawn_unix_byte(value * argv)
                     argv[4],
                     argv[5],
                     argv[6],
-                    argv[7]);
+                    argv[7],
+                    argv[8]);
 }
 
 CAMLprim value spawn_windows_byte(value * argv)
diff --git a/vendor/update-spawn.sh b/vendor/update-spawn.sh
index 31bf2a39839..85cc86c342e 100755
--- a/vendor/update-spawn.sh
+++ b/vendor/update-spawn.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-version=b5a25cab2f53a5ee9e10a7b8a96506cc61ce1198
+version=f7b883ebc9bfa5c142a2f10a62b415a51c83bd36
 
 set -e -o pipefail
 

From c908289b60fbe6e036476fa64581187c119b9590 Mon Sep 17 00:00:00 2001
From: Jeremie Dimino <jeremie@dimino.org>
Date: Wed, 13 Oct 2021 17:29:15 +0100
Subject: [PATCH 2/2] Run each action in its own process group

Signed-off-by: Jeremie Dimino <jeremie@dimino.org>
---
 CHANGES.md                                    |  3 ++
 src/dune_engine/process.ml                    |  5 ++-
 src/dune_engine/scheduler.ml                  | 43 ++++++++++++++++---
 src/dune_engine/scheduler.mli                 | 10 ++++-
 .../test-cases/actions/stray-process.t        |  4 +-
 5 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 509c265b73d..d94865d99f6 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -222,6 +222,9 @@ Unreleased
   cram test cannot escape the sandbox and pick up some random git or
   mercurial repository on the file system (#4996, @jeremiedimino)
 
+- Run each action in its own process group so that we don't leave
+  stray processes behind when killing actions (#4998, @jeremiedimino)
+
 2.9.1 (07/09/2021)
 ------------------
 
diff --git a/src/dune_engine/process.ml b/src/dune_engine/process.ml
index 2edb8ed0ef4..28762a8f786 100644
--- a/src/dune_engine/process.ml
+++ b/src/dune_engine/process.ml
@@ -681,6 +681,7 @@ let run_internal ?dir ?(stdout_to = Io.stdout) ?(stderr_to = Io.stderr)
           let now = Unix.gettimeofday () in
           ( now
           , Spawn.spawn () ~prog:prog_str ~argv ~env ~stdout ~stderr ~stdin
+              ~setpgid:Spawn.Pgid.new_process_group
               ~cwd:
                 (match dir with
                 | None -> Inherit
@@ -697,7 +698,9 @@ let run_internal ?dir ?(stdout_to = Io.stdout) ?(stderr_to = Io.stderr)
       in
       Io.release stdout_to;
       Io.release stderr_to;
-      let+ process_info = Scheduler.wait_for_process pid in
+      let+ process_info =
+        Scheduler.wait_for_process pid ~is_process_group_leader:true
+      in
       let times =
         { Proc.Times.elapsed_time = process_info.end_time -. started_at
         ; resource_usage = process_info.resource_usage
diff --git a/src/dune_engine/scheduler.ml b/src/dune_engine/scheduler.ml
index 1bfe0c6fd14..f99f89a1629 100644
--- a/src/dune_engine/scheduler.ml
+++ b/src/dune_engine/scheduler.ml
@@ -470,6 +470,33 @@ end
 
 module Event_queue = Event.Queue
 
+let kill_process_group pid signal =
+  match Sys.win32 with
+  | false -> (
+    (* Send to the entire process group so that any child processes created by
+       the job are also terminated.
+
+       Here we could consider sending a signal to the job process directly in
+       addition to sending it to the process group. This is what GNU [timeout]
+       does, for example.
+
+       The upside would be that we deliver the signal to that process even if it
+       changes its process group. This upside is small because moving between
+       the process groups is a very unusual thing to do (creation of a new
+       process group is not a problem for us, unlike for [timeout]).
+
+       The downside is that it's more complicated, but also that by sending the
+       signal twice we're greatly increasing the existing race condition where
+       we call [wait] in parallel with [kill]. *)
+    try Unix.kill (-Pid.to_int pid) signal with
+    | Unix.Unix_error _ -> ())
+  | true -> (
+    (* Process groups are not supported on Windows (or even if they are, [spawn]
+       does not know how to use them), so we're only sending the signal to the
+       job itself. *)
+    try Unix.kill (Pid.to_int pid) signal with
+    | Unix.Unix_error _ -> ())
+
 module Process_watcher : sig
   (** Initialize the process watcher thread. *)
   type t
@@ -550,9 +577,7 @@ end = struct
 
   let killall t signal =
     Mutex.lock t.mutex;
-    Process_table.iter t ~f:(fun job ->
-        try Unix.kill (Pid.to_int job.pid) signal with
-        | Unix.Unix_error _ -> ());
+    Process_table.iter t ~f:(fun job -> kill_process_group job.pid signal);
     Mutex.unlock t.mutex
 
   exception Finished of Proc.Process_info.t
@@ -1289,7 +1314,7 @@ let inject_memo_invalidation invalidation =
   Event.Queue.send_invalidation_event t.events invalidation;
   Fiber.return ()
 
-let wait_for_process_with_timeout t pid ~timeout =
+let wait_for_process_with_timeout t pid ~timeout ~is_process_group_leader =
   Fiber.of_thunk (fun () ->
       let sleep = Alarm_clock.sleep (Lazy.force t.alarm_clock) timeout in
       Fiber.fork_and_join_unit
@@ -1297,17 +1322,21 @@ let wait_for_process_with_timeout t pid ~timeout =
           let+ res = Alarm_clock.await sleep in
           if res = `Finished && Process_watcher.is_running t.process_watcher pid
           then
-            Unix.kill (Pid.to_int pid) Sys.sigkill)
+            if is_process_group_leader then
+              kill_process_group pid Sys.sigkill
+            else
+              Unix.kill (Pid.to_int pid) Sys.sigkill)
         (fun () ->
           let+ res = wait_for_process t pid in
           Alarm_clock.cancel (Lazy.force t.alarm_clock) sleep;
           res))
 
-let wait_for_process ?timeout pid =
+let wait_for_process ?timeout ?(is_process_group_leader = false) pid =
   let* t = t () in
   match timeout with
   | None -> wait_for_process t pid
-  | Some timeout -> wait_for_process_with_timeout t pid ~timeout
+  | Some timeout ->
+    wait_for_process_with_timeout t pid ~timeout ~is_process_group_leader
 
 let sleep duration =
   let* t = t () in
diff --git a/src/dune_engine/scheduler.mli b/src/dune_engine/scheduler.mli
index 1dcbb7d3094..453189066fc 100644
--- a/src/dune_engine/scheduler.mli
+++ b/src/dune_engine/scheduler.mli
@@ -120,8 +120,14 @@ val t : unit -> t Fiber.t
     available and then calls [f]. *)
 val with_job_slot : (Config.t -> 'a Fiber.t) -> 'a Fiber.t
 
-(** Wait for the following process to terminate *)
-val wait_for_process : ?timeout:float -> Pid.t -> Proc.Process_info.t Fiber.t
+(** Wait for the following process to terminate. If [is_process_group_leader] is
+    true, kill the entire process group instead of just the process in case of
+    timeout. *)
+val wait_for_process :
+     ?timeout:float
+  -> ?is_process_group_leader:bool
+  -> Pid.t
+  -> Proc.Process_info.t Fiber.t
 
 val yield_if_there_are_pending_events : unit -> unit Fiber.t
 
diff --git a/test/blackbox-tests/test-cases/actions/stray-process.t b/test/blackbox-tests/test-cases/actions/stray-process.t
index 8ab52833913..ac6d747f1f9 100644
--- a/test/blackbox-tests/test-cases/actions/stray-process.t
+++ b/test/blackbox-tests/test-cases/actions/stray-process.t
@@ -29,9 +29,9 @@ Now we stop Dune, which should normally kill all sub-processes:
   waiting for inotify sync
   waited for inotify sync
 
-  $ if kill -s 0 $CHILD_PID; then
+  $ if kill -s 0 $CHILD_PID 2> /dev/null; then
   >   echo "FAILURE: child is still running"
   > else
   >   echo "SUCCESS: child has exited"
   > fi
-  FAILURE: child is still running
+  SUCCESS: child has exited