Skip to content

Commit 037dc51

Browse files
authored
Adjust heartbeat behavior (#180)
* Add heartbeat pause/resume capability
* Add check to avoid negative sleep duration
* Disable heartbeats in `jl_print_task_backtraces()`

  `jl_print_task_backtraces()` can take long enough that there can be heartbeat loss, which can trigger printing task backtraces again, unless it is called from the heartbeat thread, which takes care of that possible problem.
* Pause heartbeats for GC
* Address review comment
* Address review comment
1 parent a911d00 commit 037dc51

File tree

3 files changed

+77
-5
lines changed

3 files changed

+77
-5
lines changed

src/gc.c

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3734,6 +3734,9 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
37343734
return recollect;
37353735
}
37363736

3737+
extern int jl_heartbeat_pause(void);
3738+
extern int jl_heartbeat_resume(void);
3739+
37373740
JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
37383741
{
37393742
JL_PROBE_GC_BEGIN(collection);
@@ -3775,6 +3778,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
37753778
// existence of the thread in the jl_n_threads count.
37763779
//
37773780
// TODO: concurrently queue objects
3781+
jl_heartbeat_pause();
37783782
jl_fence();
37793783
gc_n_threads = jl_atomic_load_acquire(&jl_n_threads);
37803784
gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states);
@@ -3806,6 +3810,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
38063810

38073811
gc_n_threads = 0;
38083812
gc_all_tls_states = NULL;
3813+
jl_heartbeat_resume();
38093814
jl_safepoint_end_gc();
38103815
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
38113816
JL_PROBE_GC_END();

src/stackwalk.c

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1166,10 +1166,22 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT
11661166
}
11671167

11681168
extern int gc_first_tid;
1169+
extern int jl_inside_heartbeat_thread(void);
1170+
extern int jl_heartbeat_pause(void);
1171+
extern int jl_heartbeat_resume(void);
11691172

1170-
// Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr
1173+
// Print backtraces for all live tasks, for all threads, to jl_safe_printf
1174+
// stderr. This can take a _long_ time!
11711175
JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT
11721176
{
1177+
// disable heartbeats to prevent heartbeat loss while running this,
1178+
// unless this is called from the heartbeat thread itself; in that
1179+
// situation, the thread is busy running this and it will not be
1180+
// updating the missed heartbeats counter
1181+
if (!jl_inside_heartbeat_thread()) {
1182+
jl_heartbeat_pause();
1183+
}
1184+
11731185
size_t nthreads = jl_atomic_load_acquire(&jl_n_threads);
11741186
jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
11751187
int ctid = -1;
@@ -1232,6 +1244,10 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT
12321244
jl_safe_printf("thread (%d) ==== End thread %d\n", ctid, ptls2->tid + 1);
12331245
}
12341246
jl_safe_printf("thread (%d) ++++ Done\n", ctid);
1247+
1248+
if (!jl_inside_heartbeat_thread()) {
1249+
jl_heartbeat_resume();
1250+
}
12351251
}
12361252

12371253
#ifdef __cplusplus

src/threading.c

Lines changed: 55 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1008,6 +1008,45 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n,
10081008
return 0;
10091009
}
10101010

1011+
// temporarily pause the heartbeat thread
1012+
JL_DLLEXPORT int jl_heartbeat_pause(void)
1013+
{
1014+
if (!heartbeat_enabled) {
1015+
return -1;
1016+
}
1017+
heartbeat_enabled = 0;
1018+
return 0;
1019+
}
1020+
1021+
// resume the paused heartbeat thread
1022+
JL_DLLEXPORT int jl_heartbeat_resume(void)
1023+
{
1024+
// cannot resume if the heartbeat thread is already running
1025+
if (heartbeat_enabled) {
1026+
return -1;
1027+
}
1028+
1029+
// cannot resume if we weren't paused (disabled != paused)
1030+
if (heartbeat_interval_s == 0) {
1031+
return -1;
1032+
}
1033+
1034+
// heartbeat thread must be ready
1035+
if (uv_sem_trywait(&heartbeat_off_sem) != 0) {
1036+
return -1;
1037+
}
1038+
1039+
// reset state as we've been paused
1040+
n_hbs_missed = 0;
1041+
n_hbs_recvd = 0;
1042+
tasks_showed = 0;
1043+
1044+
// resume
1045+
heartbeat_enabled = 1;
1046+
uv_sem_post(&heartbeat_on_sem); // wake the heartbeat thread
1047+
return 0;
1048+
}
1049+
10111050
// heartbeat
10121051
JL_DLLEXPORT void jl_heartbeat(void)
10131052
{
@@ -1099,7 +1138,7 @@ void jl_heartbeat_threadfun(void *arg)
10991138
uv_sem_post(&heartbeat_off_sem);
11001139

11011140
// sleep the thread here; this semaphore is posted in
1102-
// jl_heartbeat_enable()
1141+
// jl_heartbeat_enable() or jl_heartbeat_resume()
11031142
uv_sem_wait(&heartbeat_on_sem);
11041143

11051144
// Set the sleep duration.
@@ -1111,7 +1150,7 @@ void jl_heartbeat_threadfun(void *arg)
11111150
// heartbeat is enabled; sleep, waiting for the desired interval
11121151
sleep_for(s, ns);
11131152

1114-
// if heartbeats were turned off while we were sleeping, reset
1153+
// if heartbeats were turned off/paused while we were sleeping, reset
11151154
if (!heartbeat_enabled) {
11161155
continue;
11171156
}
@@ -1122,13 +1161,15 @@ void jl_heartbeat_threadfun(void *arg)
11221161
tchb = jl_hrtime() - t0;
11231162

11241163
// adjust the next sleep duration based on how long the heartbeat
1125-
// check took
1164+
// check took, but if it took too long then use the normal duration
11261165
rs = 1;
11271166
while (tchb > 1e9) {
11281167
rs++;
11291168
tchb -= 1e9;
11301169
}
1131-
s = heartbeat_interval_s - rs;
1170+
if (rs < heartbeat_interval_s) {
1171+
s = heartbeat_interval_s - rs;
1172+
}
11321173
ns = 1e9 - tchb;
11331174
}
11341175
}
@@ -1150,6 +1191,16 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n,
11501191
return -1;
11511192
}
11521193

1194+
JL_DLLEXPORT int jl_heartbeat_pause(void)
1195+
{
1196+
return -1;
1197+
}
1198+
1199+
JL_DLLEXPORT int jl_heartbeat_resume(void)
1200+
{
1201+
return -1;
1202+
}
1203+
11531204
JL_DLLEXPORT void jl_heartbeat(void)
11541205
{
11551206
}

0 commit comments

Comments
 (0)