Commit ff412d5
add try/catch around scheduler to reset sleep state
Fixes #54700
1 parent: dfca90f
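
The change wraps the scheduler's sleep path in the runtime's JL_TRY/JL_CATCH: a `volatile int isrunning` flag records whether this thread has already decremented the global `nrunning` counter, and the catch block re-increments the counter if needed, clears the thread's sleep state via `set_not_sleeping`, and rethrows. Below is a minimal standalone sketch of that restore-then-rethrow shape, using plain setjmp/longjmp in place of the runtime's JL_TRY/JL_CATCH macros; the names here (the `nrunning` stand-in, `fake_throw`, `scheduler_step`) are illustrative, not part of the commit.

/*
 * Sketch only: emulates the commit's error-recovery pattern with setjmp/longjmp.
 * Compile with any C11 compiler, e.g. `cc -std=c11 sketch.c`.
 */
#include <setjmp.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int nrunning = 1; /* hypothetical count of non-sleeping threads */
static jmp_buf handler;         /* stand-in for the JL_TRY frame */

static void fake_throw(void) { longjmp(handler, 1); } /* e.g. SIGINT surfacing as a throw */

static void scheduler_step(int should_fail)
{
    /* volatile so the value assigned before longjmp is well-defined after it;
       this is also why the commit declares `volatile int isrunning = 1;` */
    volatile int isrunning = 1;
    if (setjmp(handler) == 0) {           /* "JL_TRY" */
        atomic_fetch_add(&nrunning, -1);  /* going to sleep: leave the running set */
        isrunning = 0;
        if (should_fail)
            fake_throw();                 /* exception while nrunning is decremented */
        atomic_fetch_add(&nrunning, 1);   /* woken normally: rejoin the running set */
        isrunning = 1;
    }
    else {                                /* "JL_CATCH": repair the invariant */
        if (!isrunning)
            atomic_fetch_add(&nrunning, 1);
        /* the real code also calls set_not_sleeping(ptls) and jl_rethrow() here */
    }
}

int main(void)
{
    scheduler_step(1);                                 /* exercise the failing path */
    printf("nrunning = %d\n", atomic_load(&nrunning)); /* prints 1: state restored */
    return 0;
}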

File tree: 1 file changed

src/scheduler.c (135 additions, 124 deletions)
@@ -456,144 +456,155 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
                 }
                 continue;
             }
-            task = get_next_task(trypoptask, q); // note: this should not yield
-            if (ptls != ct->ptls) {
-                // sigh, a yield was detected, so let's go ahead and handle it anyway by starting over
-                ptls = ct->ptls;
-                if (set_not_sleeping(ptls)) {
-                    JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
+            volatile int isrunning = 1;
+            JL_TRY {
+                task = get_next_task(trypoptask, q); // note: this should not yield
+                if (ptls != ct->ptls) {
+                    // sigh, a yield was detected, so let's go ahead and handle it anyway by starting over
+                    ptls = ct->ptls;
+                    if (set_not_sleeping(ptls)) {
+                        JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
+                    }
+                    if (task)
+                        return task;
+                    continue;
                 }
-                if (task)
+                if (task) {
+                    if (set_not_sleeping(ptls)) {
+                        JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
+                    }
                     return task;
-                continue;
-            }
-            if (task) {
-                if (set_not_sleeping(ptls)) {
-                    JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
                 }
-                return task;
-            }

-            // IO is always permitted, but outside a threaded region, only
-            // thread 0 will process messages.
-            // Inside a threaded region, any thread can listen for IO messages,
-            // and one thread should win this race and watch the event loop,
-            // but we bias away from idle threads getting parked here.
-            //
-            // The reason this works is somewhat convoluted, and closely tied to [^store_buffering_1]:
-            // - After decrementing _threadedregion, the thread is required to
-            //   call jl_wakeup_thread(0), that will kick out any thread who is
-            //   already there, and then eventually thread 0 will get here.
-            // - Inside a _threadedregion, there must exist at least one
-            //   thread that has a happens-before relationship on the libuv lock
-            //   before reaching this decision point in the code who will see
-            //   the lock as unlocked and thus must win this race here.
-            int uvlock = 0;
-            if (jl_atomic_load_relaxed(&_threadedregion)) {
-                uvlock = jl_mutex_trylock(&jl_uv_mutex);
-            }
-            else if (ptls->tid == jl_atomic_load_relaxed(&io_loop_tid)) {
-                uvlock = 1;
-                JL_UV_LOCK();
-            }
-            else {
-                // Since we might have started some IO work, we might need
-                // to ensure tid = 0 will go watch that new event source.
-                // If trylock would have succeeded, that may have been our
-                // responsibility, so need to make sure thread 0 will take care
-                // of us.
-                if (jl_atomic_load_relaxed(&jl_uv_mutex.owner) == NULL) // aka trylock
-                    jl_wakeup_thread(0);
-            }
-            if (uvlock) {
-                int enter_eventloop = may_sleep(ptls);
-                int active = 0;
-                if (jl_atomic_load_relaxed(&jl_uv_n_waiters) != 0)
-                    // if we won the race against someone who actually needs
-                    // the lock to do real work, we need to let them have it instead
-                    enter_eventloop = 0;
-                if (enter_eventloop) {
-                    uv_loop_t *loop = jl_global_event_loop();
-                    loop->stop_flag = 0;
-                    JULIA_DEBUG_SLEEPWAKE( ptls->uv_run_enter = cycleclock() );
-                    active = uv_run(loop, UV_RUN_ONCE);
-                    JULIA_DEBUG_SLEEPWAKE( ptls->uv_run_leave = cycleclock() );
-                    jl_gc_safepoint();
+                // IO is always permitted, but outside a threaded region, only
+                // thread 0 will process messages.
+                // Inside a threaded region, any thread can listen for IO messages,
+                // and one thread should win this race and watch the event loop,
+                // but we bias away from idle threads getting parked here.
+                //
+                // The reason this works is somewhat convoluted, and closely tied to [^store_buffering_1]:
+                // - After decrementing _threadedregion, the thread is required to
+                //   call jl_wakeup_thread(0), that will kick out any thread who is
+                //   already there, and then eventually thread 0 will get here.
+                // - Inside a _threadedregion, there must exist at least one
+                //   thread that has a happens-before relationship on the libuv lock
+                //   before reaching this decision point in the code who will see
+                //   the lock as unlocked and thus must win this race here.
+                int uvlock = 0;
+                if (jl_atomic_load_relaxed(&_threadedregion)) {
+                    uvlock = jl_mutex_trylock(&jl_uv_mutex);
                 }
-                JL_UV_UNLOCK();
-                // optimization: check again first if we may have work to do.
-                // Otherwise we got a spurious wakeup since some other thread
-                // that just wanted to steal libuv from us. We will just go
-                // right back to sleep on the individual wake signal to let
-                // them take it from us without conflict.
-                if (active || !may_sleep(ptls)) {
-                    if (set_not_sleeping(ptls)) {
-                        JL_PROBE_RT_SLEEP_CHECK_UV_WAKE(ptls);
-                    }
-                    start_cycles = 0;
-                    continue;
+                else if (ptls->tid == jl_atomic_load_relaxed(&io_loop_tid)) {
+                    uvlock = 1;
+                    JL_UV_LOCK();
                 }
-                if (!enter_eventloop && !jl_atomic_load_relaxed(&_threadedregion) && ptls->tid == jl_atomic_load_relaxed(&io_loop_tid)) {
-                    // thread 0 is the only thread permitted to run the event loop
-                    // so it needs to stay alive, just spin-looping if necessary
-                    if (set_not_sleeping(ptls)) {
-                        JL_PROBE_RT_SLEEP_CHECK_UV_WAKE(ptls);
+                else {
+                    // Since we might have started some IO work, we might need
+                    // to ensure tid = 0 will go watch that new event source.
+                    // If trylock would have succeeded, that may have been our
+                    // responsibility, so need to make sure thread 0 will take care
+                    // of us.
+                    if (jl_atomic_load_relaxed(&jl_uv_mutex.owner) == NULL) // aka trylock
+                        jl_wakeup_thread(0);
+                }
+                if (uvlock) {
+                    int enter_eventloop = may_sleep(ptls);
+                    int active = 0;
+                    if (jl_atomic_load_relaxed(&jl_uv_n_waiters) != 0)
+                        // if we won the race against someone who actually needs
+                        // the lock to do real work, we need to let them have it instead
+                        enter_eventloop = 0;
+                    if (enter_eventloop) {
+                        uv_loop_t *loop = jl_global_event_loop();
+                        loop->stop_flag = 0;
+                        JULIA_DEBUG_SLEEPWAKE( ptls->uv_run_enter = cycleclock() );
+                        active = uv_run(loop, UV_RUN_ONCE);
+                        JULIA_DEBUG_SLEEPWAKE( ptls->uv_run_leave = cycleclock() );
+                        jl_gc_safepoint();
+                    }
+                    JL_UV_UNLOCK();
+                    // optimization: check again first if we may have work to do.
+                    // Otherwise we got a spurious wakeup since some other thread
+                    // that just wanted to steal libuv from us. We will just go
+                    // right back to sleep on the individual wake signal to let
+                    // them take it from us without conflict.
+                    if (active || !may_sleep(ptls)) {
+                        if (set_not_sleeping(ptls)) {
+                            JL_PROBE_RT_SLEEP_CHECK_UV_WAKE(ptls);
+                        }
+                        start_cycles = 0;
+                        continue;
+                    }
+                    if (!enter_eventloop && !jl_atomic_load_relaxed(&_threadedregion) && ptls->tid == jl_atomic_load_relaxed(&io_loop_tid)) {
+                        // thread 0 is the only thread permitted to run the event loop
+                        // so it needs to stay alive, just spin-looping if necessary
+                        if (set_not_sleeping(ptls)) {
+                            JL_PROBE_RT_SLEEP_CHECK_UV_WAKE(ptls);
+                        }
+                        start_cycles = 0;
+                        continue;
                     }
-                    start_cycles = 0;
-                    continue;
                 }
-                start_cycles = 0;
-                continue;
-            }
-        }

-            // any thread which wants us running again will have to observe
-            // sleep_check_state==sleeping and increment nrunning for us
-            int wasrunning = jl_atomic_fetch_add_relaxed(&nrunning, -1);
-            assert(wasrunning);
-            if (wasrunning == 1) {
-                // This was the last running thread, and there is no thread with !may_sleep
-                // so make sure io_loop_tid is notified to check wait_empty
-                // TODO: this also might be a good time to check again that
-                // libuv's queue is truly empty, instead of during delete_thread
-                int16_t tid2 = 0;
-                if (ptls->tid != tid2) {
-                    jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid2];
-                    uv_mutex_lock(&ptls2->sleep_lock);
-                    uv_cond_signal(&ptls2->wake_signal);
-                    uv_mutex_unlock(&ptls2->sleep_lock);
+                // any thread which wants us running again will have to observe
+                // sleep_check_state==sleeping and increment nrunning for us
+                int wasrunning = jl_atomic_fetch_add_relaxed(&nrunning, -1);
+                assert(wasrunning);
+                isrunning = 0;
+                if (wasrunning == 1) {
+                    // This was the last running thread, and there is no thread with !may_sleep
+                    // so make sure io_loop_tid is notified to check wait_empty
+                    // TODO: this also might be a good time to check again that
+                    // libuv's queue is truly empty, instead of during delete_thread
+                    int16_t tid2 = 0;
+                    if (ptls->tid != tid2) {
+                        jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid2];
+                        uv_mutex_lock(&ptls2->sleep_lock);
+                        uv_cond_signal(&ptls2->wake_signal);
+                        uv_mutex_unlock(&ptls2->sleep_lock);
+                    }
                 }
-            }

-            // the other threads will just wait for an individual wake signal to resume
-            JULIA_DEBUG_SLEEPWAKE( ptls->sleep_enter = cycleclock() );
-            int8_t gc_state = jl_gc_safe_enter(ptls);
-            uv_mutex_lock(&ptls->sleep_lock);
-            while (may_sleep(ptls)) {
-                if (ptls->tid == 0) {
-                    task = wait_empty;
-                    if (task && jl_atomic_load_relaxed(&nrunning) == 0) {
-                        wasrunning = jl_atomic_fetch_add_relaxed(&nrunning, 1);
-                        assert(!wasrunning);
-                        wasrunning = !set_not_sleeping(ptls);
-                        assert(!wasrunning);
-                        JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
-                        if (!ptls->finalizers_inhibited)
-                            ptls->finalizers_inhibited++; // this annoyingly is rather sticky (we should like to reset it at the end of jl_task_wait_empty)
-                        break;
+                // the other threads will just wait for an individual wake signal to resume
+                JULIA_DEBUG_SLEEPWAKE( ptls->sleep_enter = cycleclock() );
+                int8_t gc_state = jl_gc_safe_enter(ptls);
+                uv_mutex_lock(&ptls->sleep_lock);
+                while (may_sleep(ptls)) {
+                    if (ptls->tid == 0) {
+                        task = wait_empty;
+                        if (task && jl_atomic_load_relaxed(&nrunning) == 0) {
+                            wasrunning = jl_atomic_fetch_add_relaxed(&nrunning, 1);
+                            assert(!wasrunning);
+                            wasrunning = !set_not_sleeping(ptls);
+                            assert(!wasrunning);
+                            JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
+                            if (!ptls->finalizers_inhibited)
+                                ptls->finalizers_inhibited++; // this annoyingly is rather sticky (we should like to reset it at the end of jl_task_wait_empty)
+                            break;
+                        }
+                        task = NULL;
                     }
-                    task = NULL;
+                    // else should we warn the user of certain deadlock here if tid == 0 && nrunning == 0?
+                    uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock);
+                }
+                assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping);
+                assert(jl_atomic_load_relaxed(&nrunning));
+                start_cycles = 0;
+                uv_mutex_unlock(&ptls->sleep_lock);
+                JULIA_DEBUG_SLEEPWAKE( ptls->sleep_leave = cycleclock() );
+                jl_gc_safe_leave(ptls, gc_state); // contains jl_gc_safepoint
+                if (task) {
+                    assert(task == wait_empty);
+                    wait_empty = NULL;
+                    return task;
                 }
-                // else should we warn the user of certain deadlock here if tid == 0 && nrunning == 0?
-                uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock);
             }
-            assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping);
-            assert(jl_atomic_load_relaxed(&nrunning));
-            start_cycles = 0;
-            uv_mutex_unlock(&ptls->sleep_lock);
-            JULIA_DEBUG_SLEEPWAKE( ptls->sleep_leave = cycleclock() );
-            jl_gc_safe_leave(ptls, gc_state); // contains jl_gc_safepoint
-            if (task) {
-                assert(task == wait_empty);
-                wait_empty = NULL;
-                return task;
+            JL_CATCH { // probably SIGINT, but possibly a user mistake in trypoptask
+                if (!isrunning)
+                    jl_atomic_fetch_add_relaxed(&nrunning, 1);
+                set_not_sleeping(ptls);
+                // TODO: if (task != NULL) instead attach this exception there, so that we don't forgot to ever reschedule it
+                jl_rethrow();
             }
         }
         else {
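
Two details in the catch path are worth noting (my reading of the diff, not commentary from the commit itself): `isrunning` must be `volatile` because JL_TRY/JL_CATCH unwind via setjmp/longjmp, so a plain local modified inside the try body would have an indeterminate value in the handler; and `nrunning` is re-incremented only when `isrunning` is 0, i.e. only when the exception arrived after this thread had already left the running count, so the counter is never over-corrected. The handler then marks the thread not-sleeping and rethrows, letting the exception propagate as before.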
