From b4d8d5cb17e0e3b64f5304990cc29d8809ec23ec Mon Sep 17 00:00:00 2001 From: Jeremie Dimino <jeremie@dimino.org> Date: Tue, 12 Oct 2021 13:34:31 +0100 Subject: [PATCH 1/2] Update spawn Signed-off-by: Jeremie Dimino <jeremie@dimino.org> --- vendor/spawn/src/spawn.ml | 41 +++++++++++++++++++--------------- vendor/spawn/src/spawn.mli | 22 ++++++++++++++++++ vendor/spawn/src/spawn_stubs.c | 23 ++++++++++++++++--- vendor/update-spawn.sh | 2 +- 4 files changed, 66 insertions(+), 22 deletions(-) diff --git a/vendor/spawn/src/spawn.ml b/vendor/spawn/src/spawn.ml index 658ccc466c2..cc0e72d405c 100644 --- a/vendor/spawn/src/spawn.ml +++ b/vendor/spawn/src/spawn.ml @@ -75,6 +75,21 @@ module Env : Env = (val if Sys.win32 then else (module Env_unix) : Env) +module Pgid = struct + type t = int + + let new_process_group = 0 + + let of_pid = function + | 0 -> + raise (Invalid_argument "bad pid: 0 (hint: use [Pgid.new_process_group])") + | t -> + if t < 0 then + raise (Invalid_argument ("bad pid: " ^ string_of_int t)) + else + t +end + external spawn_unix : env:Env.t option -> cwd:Working_dir.t @@ -84,6 +99,7 @@ external spawn_unix : -> stdout:Unix.file_descr -> stderr:Unix.file_descr -> use_vfork:bool + -> setpgid:int option -> int = "spawn_unix_byte" "spawn_unix" external spawn_windows : @@ -96,7 +112,8 @@ external spawn_windows : -> stderr:Unix.file_descr -> int = "spawn_windows_byte" "spawn_windows" -let spawn_windows ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork:_ = +let spawn_windows ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork:_ + ~setpgid:_ = let cwd = match (cwd : Working_dir.t) with | Path p -> Some p @@ -120,7 +137,7 @@ let no_null s = let spawn ?env ?(cwd = Working_dir.Inherit) ~prog ~argv ?(stdin = Unix.stdin) ?(stdout = Unix.stdout) ?(stderr = Unix.stderr) - ?(unix_backend = Unix_backend.default) () = + ?(unix_backend = Unix_backend.default) ?setpgid () = (match cwd with | Path s -> no_null s | Fd _ @@ -139,25 +156,13 @@ let spawn ?env ?(cwd = Working_dir.Inherit) ~prog ~argv ?(stdin = Unix.stdin) | Vfork -> true | Fork -> false in - backend ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork + backend ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork ~setpgid external safe_pipe : unit -> Unix.file_descr * Unix.file_descr = "spawn_pipe" let safe_pipe = - if Sys.win32 then ( + if Sys.win32 then fun () -> - (* CR-someday jdimino: fix race conditions on Windows *) - let fdr, fdw = Unix.pipe () in - match - Unix.set_close_on_exec fdr; - Unix.set_close_on_exec fdw - with - | () -> (fdr, fdw) - | exception exn -> - (try Unix.close fdr with - | _ -> ()); - (try Unix.close fdw with - | _ -> ()); - raise exn - ) else + Unix.pipe ~cloexec:true () + else safe_pipe diff --git a/vendor/spawn/src/spawn.mli b/vendor/spawn/src/spawn.mli index ce9c7240d1c..ca2bc1ddfa2 100644 --- a/vendor/spawn/src/spawn.mli +++ b/vendor/spawn/src/spawn.mli @@ -31,6 +31,21 @@ module Env : sig val of_list : string list -> t end +(** Process group IDs *) +module Pgid : sig + (** Representation of the second parameter to [setpgid]. If a value of this + type is provided to [spawn], the child will immediately set its pgid + accordingly. *) + type t + + (** Sets the child's pgid to the same as its process id. Equivalent to calling + [setpgid(0, 0)]. *) + val new_process_group : t + + (** Raises [Invalid_arg] if the value is not strictly positive. *) + val of_pid : int -> t +end + (** Spawn a sub-command and return its PID. This function is low-level and should be used to build higher-level APIs. @@ -73,6 +88,12 @@ end input, output and error output of the sub-process. When not specified, they default to the ones from the calling process. + {b Process groups} + + If [setpgid] is provided, the child will immediately call [setpgid(0,pid)], + where [pid] is a [pid_t] determined from the [Pgid.t] given (see that + module). This parameter has no effect on Windows platforms. + {b Signals} On Unix, the sub-process will have all its signals unblocked. @@ -91,6 +112,7 @@ val spawn : -> ?stdout:Unix.file_descr -> ?stderr:Unix.file_descr -> ?unix_backend:Unix_backend.t (* default: [Unix_backend.default] *) + -> ?setpgid:Pgid.t -> unit -> int diff --git a/vendor/spawn/src/spawn_stubs.c b/vendor/spawn/src/spawn_stubs.c index aebe124e1b5..eabf6f6af7a 100644 --- a/vendor/spawn/src/spawn_stubs.c +++ b/vendor/spawn/src/spawn_stubs.c @@ -208,6 +208,8 @@ struct spawn_info { char *prog; char **argv; int std_fds[3]; + int set_pgid; + pid_t pgid; }; static void subprocess(int failure_fd, struct spawn_info *info) @@ -216,6 +218,13 @@ static void subprocess(int failure_fd, struct spawn_info *info) struct sigaction sa; sigset_t sigset; + if (info->set_pgid) { + if (setpgid(0, info->pgid) == -1) { + subprocess_failure(failure_fd, "setpgid", NOTHING); + return; + } + } + /* Restore all signals to their default behavior before unblocking them, to avoid invoking handlers from the parent */ sa.sa_handler = SIG_DFL; @@ -349,7 +358,8 @@ CAMLprim value spawn_unix(value v_env, value v_stdin, value v_stdout, value v_stderr, - value v_use_vfork) + value v_use_vfork, + value v_setpgid) { CAMLparam4(v_env, v_cwd, v_prog, v_argv); pid_t ret; @@ -394,6 +404,10 @@ CAMLprim value spawn_unix(value v_env, info.env = Is_block(v_env) ? alloc_string_vect(Field(v_env, 0)) : copy_c_string_array(environ); + info.set_pgid = Is_block(v_setpgid); + info.pgid = + Is_block(v_setpgid) ? + Long_val(Field(v_setpgid, 0)) : 0; caml_enter_blocking_section(); enter_safe_pipe_section(); @@ -508,7 +522,8 @@ CAMLprim value spawn_unix(value v_env, value v_stdin, value v_stdout, value v_stderr, - value v_use_vfork) + value v_use_vfork, + value v_setpgid) { (void)v_env; (void)v_cwd; @@ -518,6 +533,7 @@ CAMLprim value spawn_unix(value v_env, (void)v_stdout; (void)v_stderr; (void)v_use_vfork; + (void)v_setpgid; unix_error(ENOSYS, "spawn_unix", Nothing); } @@ -598,7 +614,8 @@ CAMLprim value spawn_unix_byte(value * argv) argv[4], argv[5], argv[6], - argv[7]); + argv[7], + argv[8]); } CAMLprim value spawn_windows_byte(value * argv) diff --git a/vendor/update-spawn.sh b/vendor/update-spawn.sh index 31bf2a39839..85cc86c342e 100755 --- a/vendor/update-spawn.sh +++ b/vendor/update-spawn.sh @@ -1,6 +1,6 @@ #!/bin/bash -version=b5a25cab2f53a5ee9e10a7b8a96506cc61ce1198 +version=f7b883ebc9bfa5c142a2f10a62b415a51c83bd36 set -e -o pipefail From c908289b60fbe6e036476fa64581187c119b9590 Mon Sep 17 00:00:00 2001 From: Jeremie Dimino <jeremie@dimino.org> Date: Wed, 13 Oct 2021 17:29:15 +0100 Subject: [PATCH 2/2] Run each action in its own process group Signed-off-by: Jeremie Dimino <jeremie@dimino.org> --- CHANGES.md | 3 ++ src/dune_engine/process.ml | 5 ++- src/dune_engine/scheduler.ml | 43 ++++++++++++++++--- src/dune_engine/scheduler.mli | 10 ++++- .../test-cases/actions/stray-process.t | 4 +- 5 files changed, 53 insertions(+), 12 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 509c265b73d..d94865d99f6 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -222,6 +222,9 @@ Unreleased cram test cannot escape the sandbox and pick up some random git or mercurial repository on the file system (#4996, @jeremiedimino) +- Run each action in its own process group so that we don't leave + stray processes behind when killing actions (#4998, @jeremiedimino) + 2.9.1 (07/09/2021) ------------------ diff --git a/src/dune_engine/process.ml b/src/dune_engine/process.ml index 2edb8ed0ef4..28762a8f786 100644 --- a/src/dune_engine/process.ml +++ b/src/dune_engine/process.ml @@ -681,6 +681,7 @@ let run_internal ?dir ?(stdout_to = Io.stdout) ?(stderr_to = Io.stderr) let now = Unix.gettimeofday () in ( now , Spawn.spawn () ~prog:prog_str ~argv ~env ~stdout ~stderr ~stdin + ~setpgid:Spawn.Pgid.new_process_group ~cwd: (match dir with | None -> Inherit @@ -697,7 +698,9 @@ let run_internal ?dir ?(stdout_to = Io.stdout) ?(stderr_to = Io.stderr) in Io.release stdout_to; Io.release stderr_to; - let+ process_info = Scheduler.wait_for_process pid in + let+ process_info = + Scheduler.wait_for_process pid ~is_process_group_leader:true + in let times = { Proc.Times.elapsed_time = process_info.end_time -. started_at ; resource_usage = process_info.resource_usage diff --git a/src/dune_engine/scheduler.ml b/src/dune_engine/scheduler.ml index 1bfe0c6fd14..f99f89a1629 100644 --- a/src/dune_engine/scheduler.ml +++ b/src/dune_engine/scheduler.ml @@ -470,6 +470,33 @@ end module Event_queue = Event.Queue +let kill_process_group pid signal = + match Sys.win32 with + | false -> ( + (* Send to the entire process group so that any child processes created by + the job are also terminated. + + Here we could consider sending a signal to the job process directly in + addition to sending it to the process group. This is what GNU [timeout] + does, for example. + + The upside would be that we deliver the signal to that process even if it + changes its process group. This upside is small because moving between + the process groups is a very unusual thing to do (creation of a new + process group is not a problem for us, unlike for [timeout]). + + The downside is that it's more complicated, but also that by sending the + signal twice we're greatly increasing the existing race condition where + we call [wait] in parallel with [kill]. *) + try Unix.kill (-Pid.to_int pid) signal with + | Unix.Unix_error _ -> ()) + | true -> ( + (* Process groups are not supported on Windows (or even if they are, [spawn] + does not know how to use them), so we're only sending the signal to the + job itself. *) + try Unix.kill (Pid.to_int pid) signal with + | Unix.Unix_error _ -> ()) + module Process_watcher : sig (** Initialize the process watcher thread. *) type t @@ -550,9 +577,7 @@ end = struct let killall t signal = Mutex.lock t.mutex; - Process_table.iter t ~f:(fun job -> - try Unix.kill (Pid.to_int job.pid) signal with - | Unix.Unix_error _ -> ()); + Process_table.iter t ~f:(fun job -> kill_process_group job.pid signal); Mutex.unlock t.mutex exception Finished of Proc.Process_info.t @@ -1289,7 +1314,7 @@ let inject_memo_invalidation invalidation = Event.Queue.send_invalidation_event t.events invalidation; Fiber.return () -let wait_for_process_with_timeout t pid ~timeout = +let wait_for_process_with_timeout t pid ~timeout ~is_process_group_leader = Fiber.of_thunk (fun () -> let sleep = Alarm_clock.sleep (Lazy.force t.alarm_clock) timeout in Fiber.fork_and_join_unit @@ -1297,17 +1322,21 @@ let wait_for_process_with_timeout t pid ~timeout = let+ res = Alarm_clock.await sleep in if res = `Finished && Process_watcher.is_running t.process_watcher pid then - Unix.kill (Pid.to_int pid) Sys.sigkill) + if is_process_group_leader then + kill_process_group pid Sys.sigkill + else + Unix.kill (Pid.to_int pid) Sys.sigkill) (fun () -> let+ res = wait_for_process t pid in Alarm_clock.cancel (Lazy.force t.alarm_clock) sleep; res)) -let wait_for_process ?timeout pid = +let wait_for_process ?timeout ?(is_process_group_leader = false) pid = let* t = t () in match timeout with | None -> wait_for_process t pid - | Some timeout -> wait_for_process_with_timeout t pid ~timeout + | Some timeout -> + wait_for_process_with_timeout t pid ~timeout ~is_process_group_leader let sleep duration = let* t = t () in diff --git a/src/dune_engine/scheduler.mli b/src/dune_engine/scheduler.mli index 1dcbb7d3094..453189066fc 100644 --- a/src/dune_engine/scheduler.mli +++ b/src/dune_engine/scheduler.mli @@ -120,8 +120,14 @@ val t : unit -> t Fiber.t available and then calls [f]. *) val with_job_slot : (Config.t -> 'a Fiber.t) -> 'a Fiber.t -(** Wait for the following process to terminate *) -val wait_for_process : ?timeout:float -> Pid.t -> Proc.Process_info.t Fiber.t +(** Wait for the following process to terminate. If [is_process_group_leader] is + true, kill the entire process group instead of just the process in case of + timeout. *) +val wait_for_process : + ?timeout:float + -> ?is_process_group_leader:bool + -> Pid.t + -> Proc.Process_info.t Fiber.t val yield_if_there_are_pending_events : unit -> unit Fiber.t diff --git a/test/blackbox-tests/test-cases/actions/stray-process.t b/test/blackbox-tests/test-cases/actions/stray-process.t index 8ab52833913..ac6d747f1f9 100644 --- a/test/blackbox-tests/test-cases/actions/stray-process.t +++ b/test/blackbox-tests/test-cases/actions/stray-process.t @@ -29,9 +29,9 @@ Now we stop Dune, which should normally kill all sub-processes: waiting for inotify sync waited for inotify sync - $ if kill -s 0 $CHILD_PID; then + $ if kill -s 0 $CHILD_PID 2> /dev/null; then > echo "FAILURE: child is still running" > else > echo "SUCCESS: child has exited" > fi - FAILURE: child is still running + SUCCESS: child has exited