Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run each action in its own process group #4998

Merged
2 commits merged into from Oct 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,9 @@ Unreleased
cram test cannot escape the sandbox and pick up some random git or
mercurial repository on the file system (#4996, @jeremiedimino)

- Run each action in its own process group so that we don't leave
stray processes behind when killing actions (#4998, @jeremiedimino)

2.9.1 (07/09/2021)
------------------

Expand Down
5 changes: 4 additions & 1 deletion src/dune_engine/process.ml
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,7 @@ let run_internal ?dir ?(stdout_to = Io.stdout) ?(stderr_to = Io.stderr)
let now = Unix.gettimeofday () in
( now
, Spawn.spawn () ~prog:prog_str ~argv ~env ~stdout ~stderr ~stdin
~setpgid:Spawn.Pgid.new_process_group
~cwd:
(match dir with
| None -> Inherit
Expand All @@ -697,7 +698,9 @@ let run_internal ?dir ?(stdout_to = Io.stdout) ?(stderr_to = Io.stderr)
in
Io.release stdout_to;
Io.release stderr_to;
let+ process_info = Scheduler.wait_for_process pid in
let+ process_info =
Scheduler.wait_for_process pid ~is_process_group_leader:true
in
let times =
{ Proc.Times.elapsed_time = process_info.end_time -. started_at
; resource_usage = process_info.resource_usage
Expand Down
43 changes: 36 additions & 7 deletions src/dune_engine/scheduler.ml
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,33 @@ end

module Event_queue = Event.Queue

let kill_process_group pid signal =
match Sys.win32 with
| false -> (
(* Send to the entire process group so that any child processes created by
the job are also terminated.

Here we could consider sending a signal to the job process directly in
addition to sending it to the process group. This is what GNU [timeout]
does, for example.

The upside would be that we deliver the signal to that process even if it
changes its process group. This upside is small because moving between
the process groups is a very unusual thing to do (creation of a new
process group is not a problem for us, unlike for [timeout]).

The downside is that it's more complicated, but also that by sending the
signal twice we're greatly increasing the existing race condition where
we call [wait] in parallel with [kill]. *)
try Unix.kill (-Pid.to_int pid) signal with
| Unix.Unix_error _ -> ())
This conversation was marked as resolved.
Show resolved Hide resolved
| true -> (
(* Process groups are not supported on Windows (or even if they are, [spawn]
does not know how to use them), so we're only sending the signal to the
job itself. *)
try Unix.kill (Pid.to_int pid) signal with
| Unix.Unix_error _ -> ())

module Process_watcher : sig
(** Initialize the process watcher thread. *)
type t
Expand Down Expand Up @@ -550,9 +577,7 @@ end = struct

let killall t signal =
Mutex.lock t.mutex;
Process_table.iter t ~f:(fun job ->
try Unix.kill (Pid.to_int job.pid) signal with
| Unix.Unix_error _ -> ());
Process_table.iter t ~f:(fun job -> kill_process_group job.pid signal);
Mutex.unlock t.mutex

exception Finished of Proc.Process_info.t
Expand Down Expand Up @@ -1289,25 +1314,29 @@ let inject_memo_invalidation invalidation =
Event.Queue.send_invalidation_event t.events invalidation;
Fiber.return ()

let wait_for_process_with_timeout t pid ~timeout =
let wait_for_process_with_timeout t pid ~timeout ~is_process_group_leader =
Fiber.of_thunk (fun () ->
let sleep = Alarm_clock.sleep (Lazy.force t.alarm_clock) timeout in
Fiber.fork_and_join_unit
(fun () ->
let+ res = Alarm_clock.await sleep in
if res = `Finished && Process_watcher.is_running t.process_watcher pid
then
Unix.kill (Pid.to_int pid) Sys.sigkill)
if is_process_group_leader then
kill_process_group pid Sys.sigkill
else
Unix.kill (Pid.to_int pid) Sys.sigkill)
(fun () ->
let+ res = wait_for_process t pid in
Alarm_clock.cancel (Lazy.force t.alarm_clock) sleep;
res))

let wait_for_process ?timeout pid =
let wait_for_process ?timeout ?(is_process_group_leader = false) pid =
let* t = t () in
match timeout with
| None -> wait_for_process t pid
| Some timeout -> wait_for_process_with_timeout t pid ~timeout
| Some timeout ->
wait_for_process_with_timeout t pid ~timeout ~is_process_group_leader

let sleep duration =
let* t = t () in
Expand Down
10 changes: 8 additions & 2 deletions src/dune_engine/scheduler.mli
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,14 @@ val t : unit -> t Fiber.t
available and then calls [f]. *)
val with_job_slot : (Config.t -> 'a Fiber.t) -> 'a Fiber.t

(** Wait for the following process to terminate *)
val wait_for_process : ?timeout:float -> Pid.t -> Proc.Process_info.t Fiber.t
(** Wait for the following process to terminate. If [is_process_group_leader] is
true, kill the entire process group instead of just the process in case of
timeout. *)
val wait_for_process :
?timeout:float
-> ?is_process_group_leader:bool
-> Pid.t
This conversation was marked as resolved.
Show resolved Hide resolved
-> Proc.Process_info.t Fiber.t

val yield_if_there_are_pending_events : unit -> unit Fiber.t

Expand Down
4 changes: 2 additions & 2 deletions test/blackbox-tests/test-cases/actions/stray-process.t
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ Now we stop Dune, which should normally kill all sub-processes:
waiting for inotify sync
waited for inotify sync

$ if kill -s 0 $CHILD_PID; then
$ if kill -s 0 $CHILD_PID 2> /dev/null; then
> echo "FAILURE: child is still running"
> else
> echo "SUCCESS: child has exited"
> fi
FAILURE: child is still running
SUCCESS: child has exited
41 changes: 23 additions & 18 deletions vendor/spawn/src/spawn.ml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,21 @@ module Env : Env = (val if Sys.win32 then
else
(module Env_unix) : Env)

module Pgid = struct
type t = int

let new_process_group = 0

let of_pid = function
| 0 ->
raise (Invalid_argument "bad pid: 0 (hint: use [Pgid.new_process_group])")
| t ->
if t < 0 then
raise (Invalid_argument ("bad pid: " ^ string_of_int t))
else
t
end

external spawn_unix :
env:Env.t option
-> cwd:Working_dir.t
Expand All @@ -84,6 +99,7 @@ external spawn_unix :
-> stdout:Unix.file_descr
-> stderr:Unix.file_descr
-> use_vfork:bool
-> setpgid:int option
-> int = "spawn_unix_byte" "spawn_unix"

external spawn_windows :
Expand All @@ -96,7 +112,8 @@ external spawn_windows :
-> stderr:Unix.file_descr
-> int = "spawn_windows_byte" "spawn_windows"

let spawn_windows ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork:_ =
let spawn_windows ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork:_
~setpgid:_ =
let cwd =
match (cwd : Working_dir.t) with
| Path p -> Some p
Expand All @@ -120,7 +137,7 @@ let no_null s =

let spawn ?env ?(cwd = Working_dir.Inherit) ~prog ~argv ?(stdin = Unix.stdin)
?(stdout = Unix.stdout) ?(stderr = Unix.stderr)
?(unix_backend = Unix_backend.default) () =
?(unix_backend = Unix_backend.default) ?setpgid () =
(match cwd with
| Path s -> no_null s
| Fd _
Expand All @@ -139,25 +156,13 @@ let spawn ?env ?(cwd = Working_dir.Inherit) ~prog ~argv ?(stdin = Unix.stdin)
| Vfork -> true
| Fork -> false
in
backend ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork
backend ~env ~cwd ~prog ~argv ~stdin ~stdout ~stderr ~use_vfork ~setpgid

external safe_pipe : unit -> Unix.file_descr * Unix.file_descr = "spawn_pipe"

let safe_pipe =
if Sys.win32 then (
if Sys.win32 then
fun () ->
(* CR-someday jdimino: fix race conditions on Windows *)
let fdr, fdw = Unix.pipe () in
match
Unix.set_close_on_exec fdr;
Unix.set_close_on_exec fdw
with
| () -> (fdr, fdw)
| exception exn ->
(try Unix.close fdr with
| _ -> ());
(try Unix.close fdw with
| _ -> ());
raise exn
) else
Unix.pipe ~cloexec:true ()
else
safe_pipe
22 changes: 22 additions & 0 deletions vendor/spawn/src/spawn.mli
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,21 @@ module Env : sig
val of_list : string list -> t
end

(** Process group IDs *)
module Pgid : sig
(** Representation of the second parameter to [setpgid]. If a value of this
type is provided to [spawn], the child will immediately set its pgid
accordingly. *)
type t

(** Sets the child's pgid to the same as its process id. Equivalent to calling
[setpgid(0, 0)]. *)
val new_process_group : t

(** Raises [Invalid_arg] if the value is not strictly positive. *)
val of_pid : int -> t
end

(** Spawn a sub-command and return its PID. This function is low-level and
should be used to build higher-level APIs.

Expand Down Expand Up @@ -73,6 +88,12 @@ end
input, output and error output of the sub-process. When not specified, they
default to the ones from the calling process.

{b Process groups}

If [setpgid] is provided, the child will immediately call [setpgid(0,pid)],
where [pid] is a [pid_t] determined from the [Pgid.t] given (see that
module). This parameter has no effect on Windows platforms.

{b Signals}

On Unix, the sub-process will have all its signals unblocked.
Expand All @@ -91,6 +112,7 @@ val spawn :
-> ?stdout:Unix.file_descr
-> ?stderr:Unix.file_descr
-> ?unix_backend:Unix_backend.t (* default: [Unix_backend.default] *)
-> ?setpgid:Pgid.t
-> unit
-> int

Expand Down
23 changes: 20 additions & 3 deletions vendor/spawn/src/spawn_stubs.c
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ struct spawn_info {
char *prog;
char **argv;
int std_fds[3];
int set_pgid;
pid_t pgid;
};

static void subprocess(int failure_fd, struct spawn_info *info)
Expand All @@ -216,6 +218,13 @@ static void subprocess(int failure_fd, struct spawn_info *info)
struct sigaction sa;
sigset_t sigset;

if (info->set_pgid) {
if (setpgid(0, info->pgid) == -1) {
subprocess_failure(failure_fd, "setpgid", NOTHING);
return;
}
}

/* Restore all signals to their default behavior before unblocking
them, to avoid invoking handlers from the parent */
sa.sa_handler = SIG_DFL;
Expand Down Expand Up @@ -349,7 +358,8 @@ CAMLprim value spawn_unix(value v_env,
value v_stdin,
value v_stdout,
value v_stderr,
value v_use_vfork)
value v_use_vfork,
value v_setpgid)
{
CAMLparam4(v_env, v_cwd, v_prog, v_argv);
pid_t ret;
Expand Down Expand Up @@ -394,6 +404,10 @@ CAMLprim value spawn_unix(value v_env,
info.env =
Is_block(v_env) ?
alloc_string_vect(Field(v_env, 0)) : copy_c_string_array(environ);
info.set_pgid = Is_block(v_setpgid);
info.pgid =
Is_block(v_setpgid) ?
Long_val(Field(v_setpgid, 0)) : 0;

caml_enter_blocking_section();
enter_safe_pipe_section();
Expand Down Expand Up @@ -508,7 +522,8 @@ CAMLprim value spawn_unix(value v_env,
value v_stdin,
value v_stdout,
value v_stderr,
value v_use_vfork)
value v_use_vfork,
value v_setpgid)
{
(void)v_env;
(void)v_cwd;
Expand All @@ -518,6 +533,7 @@ CAMLprim value spawn_unix(value v_env,
(void)v_stdout;
(void)v_stderr;
(void)v_use_vfork;
(void)v_setpgid;
unix_error(ENOSYS, "spawn_unix", Nothing);
}

Expand Down Expand Up @@ -598,7 +614,8 @@ CAMLprim value spawn_unix_byte(value * argv)
argv[4],
argv[5],
argv[6],
argv[7]);
argv[7],
argv[8]);
}

CAMLprim value spawn_windows_byte(value * argv)
Expand Down
2 changes: 1 addition & 1 deletion vendor/update-spawn.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

version=b5a25cab2f53a5ee9e10a7b8a96506cc61ce1198
version=f7b883ebc9bfa5c142a2f10a62b415a51c83bd36

set -e -o pipefail

Expand Down