Skip to content

Commit

Permalink
[backports-release-1.11] 1.11 Profiling threading fix backports (#56358)
Browse files: browse the repository at this point in the history
Fixes #56327
  • Loading branch information
IanButterworth authored Oct 28, 2024
2 parents 6be0afc + 0495045 commit 0f7f762
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 14 deletions.
67 changes: 54 additions & 13 deletions src/signals-unix.c
Original file line number Diff line number Diff line change
Expand Up @@ -410,9 +410,11 @@ pthread_mutex_t in_signal_lock; // shared with jl_delete_thread
static bt_context_t *signal_context; // protected by in_signal_lock
static int exit_signal_cond = -1;
static int signal_caught_cond = -1;
static int signals_inflight = 0;

int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
{
int err;
pthread_mutex_lock(&in_signal_lock);
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
Expand All @@ -421,24 +423,45 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
pthread_mutex_unlock(&in_signal_lock);
return 0;
}
sig_atomic_t request = 0;
if (!jl_atomic_cmpswap(&ptls2->signal_request, &request, 1)) {
while (signals_inflight) {
// something is wrong, or there is already a usr2 in flight elsewhere
pthread_mutex_unlock(&in_signal_lock);
return 0;
// try to wait for it to finish or wait for timeout
struct pollfd event = {signal_caught_cond, POLLIN, 0};
do {
err = poll(&event, 1, timeout * 1000);
} while (err == -1 && errno == EINTR);
if (err == -1 || (event.revents & POLLIN) == 0) {
// not ready after timeout: cancel this request
pthread_mutex_unlock(&in_signal_lock);
return 0;
}
// consume it before continuing
eventfd_t got;
do {
err = read(signal_caught_cond, &got, sizeof(eventfd_t));
} while (err == -1 && errno == EINTR);
if (err != sizeof(eventfd_t)) abort();
assert(signals_inflight >= got);
signals_inflight -= got;
}
signals_inflight++;
sig_atomic_t request = jl_atomic_exchange(&ptls2->signal_request, 1);
assert(request == 0 || request == -1);
request = 1;
int err = pthread_kill(ptls2->system_id, SIGUSR2);
// wait for thread to acknowledge or timeout
struct pollfd event = {signal_caught_cond, POLLIN, 0};
err = pthread_kill(ptls2->system_id, SIGUSR2);
if (err == 0) {
// wait for thread to acknowledge or timeout
struct pollfd event = {signal_caught_cond, POLLIN, 0};
do {
err = poll(&event, 1, timeout * 1000);
} while (err == -1 && errno == EINTR);
if (err != 1 || (event.revents & POLLIN) == 0)
err = -1;
}
if ((event.revents & POLLIN) == 0) {
if (err == -1) {
// not ready after timeout: try to cancel this request
if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
signals_inflight--;
pthread_mutex_unlock(&in_signal_lock);
return 0;
}
Expand All @@ -448,11 +471,13 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
err = read(signal_caught_cond, &got, sizeof(eventfd_t));
} while (err == -1 && errno == EINTR);
if (err != sizeof(eventfd_t)) abort();
assert(got == 1); (void) got;
assert(signals_inflight >= got);
signals_inflight -= got;
signals_inflight++;
// Now the other thread is waiting on exit_signal_cond (verify that here by
// checking it is 0, and add an acquire barrier for good measure)
request = jl_atomic_load_acquire(&ptls2->signal_request);
assert(request == 0); (void) request;
assert(request == 0 || request == -1); (void) request;
jl_atomic_store_release(&ptls2->signal_request, 4); // prepare to resume normally, but later code may change this
*ctx = *signal_context;
return 1;
Expand All @@ -475,6 +500,7 @@ static void jl_try_deliver_sigint(void)
jl_safepoint_enable_sigint();
jl_wake_libuv();
pthread_mutex_lock(&in_signal_lock);
signals_inflight++;
jl_atomic_store_release(&ptls2->signal_request, 2);
// This also makes sure `sleep` is aborted.
pthread_kill(ptls2->system_id, SIGUSR2);
Expand Down Expand Up @@ -511,6 +537,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
}

// request:
// -1: processing
// 0: nothing [not from here]
// 1: get state & wait for request
// 2: throw sigint if `!defer_signal && io_wait` or if force throw threshold
Expand All @@ -526,22 +553,36 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
if (ptls == NULL)
return;
int errno_save = errno;
// acknowledge that we saw the signal_request
sig_atomic_t request = jl_atomic_exchange(&ptls->signal_request, 0);
sig_atomic_t request = jl_atomic_load(&ptls->signal_request);
if (request == 0)
return;
if (!jl_atomic_cmpswap(&ptls->signal_request, &request, -1))
return;
if (request == 1) {
signal_context = jl_to_bt_context(ctx);
// acknowledge that we saw the signal_request and set signal_context
int err;
eventfd_t got = 1;
err = write(signal_caught_cond, &got, sizeof(eventfd_t));
if (err != sizeof(eventfd_t)) abort();
sig_atomic_t processing = -1;
jl_atomic_cmpswap(&ptls->signal_request, &processing, 0);
// wait for exit signal
do {
err = read(exit_signal_cond, &got, sizeof(eventfd_t));
} while (err == -1 && errno == EINTR);
if (err != sizeof(eventfd_t)) abort();
assert(got == 1);
request = jl_atomic_exchange(&ptls->signal_request, 0);
request = jl_atomic_exchange(&ptls->signal_request, -1);
signal_context = NULL;
assert(request == 2 || request == 3 || request == 4);
}
int err;
eventfd_t got = 1;
err = write(signal_caught_cond, &got, sizeof(eventfd_t));
if (err != sizeof(eventfd_t)) abort();
sig_atomic_t processing = -1;
jl_atomic_cmpswap(&ptls->signal_request, &processing, 0);
if (request == 2) {
int force = jl_check_force_sigint();
if (force || (!ptls->defer_signal && ptls->io_wait)) {
Expand Down
3 changes: 2 additions & 1 deletion stdlib/Profile/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,8 @@ let cmd = Base.julia_cmd()
println("done")
print(Profile.len_data())
"""
p = open(`$cmd -e $script`)
# use multiple threads here to ensure that profiling works with threading
p = open(`$cmd -t2 -e $script`)
t = Timer(120) do t
# should be under 10 seconds, so give it 2 minutes then report failure
println("KILLING debuginfo registration test BY PROFILE TEST WATCHDOG\n")
Expand Down

0 comments on commit 0f7f762

Please sign in to comment.