Skip to content

Commit

Permalink
Make profiling more robust with many tasks (#42978)
Browse files Browse the repository at this point in the history
This patch includes two sets of changes.

(1) `jl_thread_suspend_and_get_state` uses `pthread_cond_timedwait` to
recover from the case where the request is not received by the signal
handler. This is required because `usr2_handler` contains some paths
for the case where it is not possible to obtain `ptls`.

(2) `ctx_switch` now makes sure to null out `ptls` of the last task
(`lastt->ptls = NULL`) after changing the current task by updating
pgcstack (`jl_set_pgcstack(&t->gcstack)`).  This closes the gap in
which `usr2_handler` can observe the null `ptls`.

Co-authored-by: Jameson Nash <[email protected]>
  • Loading branch information
tkf and vtjnash authored Nov 14, 2021
1 parent 5665e8b commit 8131580
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 3 deletions.
19 changes: 18 additions & 1 deletion src/signals-unix.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <sys/stat.h>
#include <sys/mman.h>
#include <pthread.h>
#include <time.h>
#include <errno.h>
#if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS)
#define MAP_ANONYMOUS MAP_ANON
Expand Down Expand Up @@ -368,11 +369,25 @@ static pthread_cond_t signal_caught_cond;

static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
{
struct timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
ts.tv_sec += 1;
pthread_mutex_lock(&in_signal_lock);
jl_ptls_t ptls2 = jl_all_tls_states[tid];
jl_atomic_store_release(&ptls2->signal_request, 1);
pthread_kill(ptls2->system_id, SIGUSR2);
pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge
// wait for thread to acknowledge
int err = pthread_cond_timedwait(&signal_caught_cond, &in_signal_lock, &ts);
if (err == ETIMEDOUT) {
sig_atomic_t request = 1;
if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
*ctx = NULL;
pthread_mutex_unlock(&in_signal_lock);
return;
}
err = pthread_cond_wait(&signal_caught_cond, &in_signal_lock);
}
assert(!err);
assert(jl_atomic_load_acquire(&ptls2->signal_request) == 0);
*ctx = signal_context;
}
Expand Down Expand Up @@ -758,6 +773,8 @@ static void *signal_listener(void *arg)
int i = critical ? idx : profile_round_robin_thread_order[idx];
// notify thread to stop
jl_thread_suspend_and_get_state(i, &signal_context);
if (signal_context == NULL)
continue;

// do backtrace on thread contexts for critical signals
// this part must be signal-handler safe
Expand Down
5 changes: 3 additions & 2 deletions src/task.c
Original file line number Diff line number Diff line change
Expand Up @@ -423,18 +423,19 @@ static void ctx_switch(jl_task_t *lastt)
else
#endif
*pt = NULL; // can't fail after here: clear the gc-root for the target task now
lastt->ptls = NULL;
}

// set up global state for new task and clear global state for old task
t->ptls = ptls;
jl_atomic_store_relaxed(&ptls->current_task, t);
JL_GC_PROMISE_ROOTED(t);
jl_signal_fence();
jl_set_pgcstack(&t->gcstack);
jl_signal_fence();
lastt->ptls = NULL;
#ifdef MIGRATE_TASKS
ptls->previous_task = lastt;
#endif
jl_set_pgcstack(&t->gcstack);

if (t->started) {
#ifdef COPY_STACKS
Expand Down
14 changes: 14 additions & 0 deletions test/profile_spawnmany_exec.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

using Profile

function spawnmany(n)
if n > 2
m = n ÷ 2
t = Threads.@spawn spawnmany(m)
spawnmany(m)
wait(t)
end
end

@profile spawnmany(parse(Int, get(ENV, "NTASKS", "2000000")))
36 changes: 36 additions & 0 deletions test/threads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,39 @@ end

# We don't need the watchdog anymore
close(proc.in)

# https://github.com/JuliaLang/julia/pull/42973
Sys.islinux() && @testset "spawn and wait *a lot* of tasks in @profile" begin
# Not using threads_exec.jl for better isolation, reproducibility, and a
# tighter timeout.
script = "profile_spawnmany_exec.jl"
cmd = `$(Base.julia_cmd()) --depwarn=error --rr-detach --startup-file=no $script`
@testset for n in [20000, 200000, 2000000]
proc = run(ignorestatus(setenv(cmd, "NTASKS" => n; dir = @__DIR__)); wait = false)
done = Threads.Atomic{Bool}(false)
timeout = false
timer = Timer(100) do _
timeout = true
for sig in [Base.SIGTERM, Base.SIGHUP, Base.SIGKILL]
for _ in 1:1000
kill(proc, sig)
if done[]
if sig != Base.SIGTERM
@warn "Terminating `$script` required signal $sig"
end
return
end
sleep(0.001)
end
end
end
try
wait(proc)
finally
done[] = true
close(timer)
end
@test success(proc)
@test !timeout
end
end

4 comments on commit 8131580

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily package evaluation, I will reply here when finished:

@nanosoldier runtests(ALL, isdaily = true)

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your package evaluation job has completed - possible new issues were detected. A full report can be found here.

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily package evaluation, I will reply here when finished:

@nanosoldier runtests(ALL, isdaily = true)

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your package evaluation job has completed - possible issues were detected. A full report can be found here.

Please sign in to comment.