Skip to content

Commit 1ddc97f

Browse files
kpamnanyRAI CI (GitHub Action Automation)
authored and committed
Change heartbeat thread controls
When enabling heartbeats, the user must specify: (1) heartbeat_s: jl_heartbeat() must be called at least once every heartbeat_s seconds; if it isn't, a one-line heartbeat loss report is printed; (2) show_tasks_after_n: after this many heartbeat_s intervals have passed without jl_heartbeat() being called, print task backtraces and stop all reporting; (3) reset_after_n: after this many heartbeat_s intervals have passed with jl_heartbeat() being called, print a "heartbeats recovered" message and reset reporting.
1 parent 3c1cebe commit 1ddc97f

File tree

1 file changed

+46
-49
lines changed

1 file changed

+46
-49
lines changed

src/threading.c

Lines changed: 46 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -955,9 +955,9 @@ volatile int heartbeat_enabled;
955955
uv_sem_t heartbeat_on_sem, // jl_heartbeat_enable -> thread
956956
heartbeat_off_sem; // thread -> jl_heartbeat_enable
957957
int heartbeat_interval_s,
958-
n_loss_reports,
959-
reset_reporting_s;
960-
int last_report_s, report_interval_s, n_reported;
958+
tasks_after_n,
959+
reset_tasks_after_n;
960+
int tasks_showed, n_hbs_missed, n_hbs_recvd;
961961
_Atomic(int) heartbeats;
962962

963963
JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT;
@@ -976,21 +976,19 @@ void jl_init_heartbeat(void)
976976

977977
// enable/disable heartbeats
978978
// heartbeat_s: interval within which jl_heartbeat() must be called
979-
// n_reports: for one heartbeat loss interval, how many times to report
980-
// reset_reporting_after_s: how long to wait after a heartbeat loss
981-
// interval and a return to steady heartbeats, before resetting
982-
// reporting behavior
979+
// show_tasks_after_n: number of heartbeats missed before printing task backtraces
980+
// reset_after_n: number of consecutive heartbeats received, after task backtraces were printed, before reporting is reset
983981
//
984982
// When disabling heartbeats, the heartbeat thread must wake up,
985983
// find out that heartbeats are now disabled, and reset. For now, we
986984
// handle this by preventing re-enabling of heartbeats until this
987985
// completes.
988-
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
989-
int reset_reporting_after_s)
986+
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n,
987+
int reset_after_n)
990988
{
991989
if (heartbeat_s <= 0) {
992990
heartbeat_enabled = 0;
993-
heartbeat_interval_s = n_loss_reports = reset_reporting_s = 0;
991+
heartbeat_interval_s = tasks_after_n = reset_tasks_after_n = 0;
994992
}
995993
else {
996994
// must disable before enabling
@@ -1004,10 +1002,11 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
10041002

10051003
jl_atomic_store_relaxed(&heartbeats, 0);
10061004
heartbeat_interval_s = heartbeat_s;
1007-
n_loss_reports = n_reports;
1008-
reset_reporting_s = reset_reporting_after_s;
1009-
last_report_s = 0;
1010-
report_interval_s = heartbeat_interval_s;
1005+
tasks_after_n = show_tasks_after_n;
1006+
reset_tasks_after_n = reset_after_n;
1007+
tasks_showed = 0;
1008+
n_hbs_missed = 0;
1009+
n_hbs_recvd = 0;
10111010
heartbeat_enabled = 1;
10121011
uv_sem_post(&heartbeat_on_sem); // wake the heartbeat thread
10131012
}
@@ -1043,44 +1042,42 @@ void sleep_for(int secs, int nsecs)
10431042
uint8_t check_heartbeats(uint8_t gc_state)
10441043
{
10451044
int hb = jl_atomic_exchange(&heartbeats, 0);
1046-
uint64_t curr_s = jl_hrtime() / 1e9;
10471045

10481046
if (hb <= 0) {
1049-
// we didn't get a heartbeat in the last interval; should we report?
1050-
if (n_reported < n_loss_reports &&
1051-
curr_s - last_report_s >= report_interval_s) {
1052-
jl_task_t *ct = jl_current_task;
1053-
jl_ptls_t ptls = ct->ptls;
1054-
1055-
// exit GC-safe region to report then re-enter
1056-
jl_gc_safe_leave(ptls, gc_state);
1057-
jl_safe_printf("==== heartbeat loss ====\n");
1058-
jl_print_task_backtraces(0);
1059-
gc_state = jl_gc_safe_enter(ptls);
1060-
1061-
// we've reported
1062-
n_reported++;
1063-
1064-
// record the reporting time _after_ the report
1065-
last_report_s = jl_hrtime() / 1e9;
1066-
1067-
// double the reporting interval up to a maximum
1068-
if (report_interval_s < 60 * heartbeat_interval_s) {
1069-
report_interval_s *= 2;
1047+
// we didn't get a heartbeat
1048+
n_hbs_recvd = 0;
1049+
n_hbs_missed++;
1050+
1051+
// if we've printed task backtraces already, do nothing
1052+
if (!tasks_showed) {
1053+
// otherwise, at least show this message
1054+
jl_safe_printf("==== heartbeat loss (%ds) ====\n",
1055+
n_hbs_missed * heartbeat_interval_s);
1056+
// if we've missed enough heartbeats, print task backtraces
1057+
if (n_hbs_missed >= tasks_after_n) {
1058+
jl_task_t *ct = jl_current_task;
1059+
jl_ptls_t ptls = ct->ptls;
1060+
1061+
// exit GC-safe region to report then re-enter
1062+
jl_gc_safe_leave(ptls, gc_state);
1063+
jl_print_task_backtraces(0);
1064+
gc_state = jl_gc_safe_enter(ptls);
1065+
1066+
// we printed task backtraces
1067+
tasks_showed = 1;
10701068
}
10711069
}
1072-
// no heartbeats, don't change reporting state
1073-
return gc_state;
10741070
}
10751071
else {
1076-
// we got a heartbeat; reset the report count
1077-
n_reported = 0;
1078-
}
1079-
1080-
// reset the reporting interval only once we're steadily getting
1081-
// heartbeats for the requested reset interval
1082-
if (curr_s - reset_reporting_s > last_report_s) {
1083-
report_interval_s = heartbeat_interval_s;
1072+
// got a heartbeat
1073+
n_hbs_recvd++;
1074+
// if we'd printed task backtraces, check for reset
1075+
if (tasks_showed && n_hbs_recvd >= reset_tasks_after_n) {
1076+
tasks_showed = 0;
1077+
jl_safe_printf("==== heartbeats recovered (lost for %ds) ====\n",
1078+
n_hbs_missed * heartbeat_interval_s);
1079+
}
1080+
n_hbs_missed = 0;
10841081
}
10851082

10861083
return gc_state;
@@ -1089,7 +1086,7 @@ uint8_t check_heartbeats(uint8_t gc_state)
10891086
// heartbeat thread function
10901087
void jl_heartbeat_threadfun(void *arg)
10911088
{
1092-
int s, ns = 1e9 - 1, rs;
1089+
int s = 59, ns = 1e9 - 1, rs;
10931090
uint64_t t0, tchb;
10941091

10951092
// We need a TLS because backtraces are accumulated into ptls->bt_size
@@ -1147,8 +1144,8 @@ void jl_init_heartbeat(void)
11471144
{
11481145
}
11491146

1150-
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
1151-
int reset_reporting_after_s)
1147+
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n,
1148+
int reset_after_n)
11521149
{
11531150
return -1;
11541151
}

0 commit comments

Comments
 (0)