@@ -955,9 +955,9 @@ volatile int heartbeat_enabled;
955955uv_sem_t heartbeat_on_sem , // jl_heartbeat_enable -> thread
956956 heartbeat_off_sem ; // thread -> jl_heartbeat_enable
957957int heartbeat_interval_s ,
958- n_loss_reports ,
959- reset_reporting_s ;
960- int last_report_s , report_interval_s , n_reported ;
958+ tasks_after_n ,
959+ reset_tasks_after_n ;
960+ int tasks_showed , n_hbs_missed , n_hbs_recvd ;
961961_Atomic(int ) heartbeats ;
962962
963963JL_DLLEXPORT void jl_print_task_backtraces (int show_done ) JL_NOTSAFEPOINT ;
@@ -976,21 +976,19 @@ void jl_init_heartbeat(void)
976976
977977// enable/disable heartbeats
978978// heartbeat_s: interval within which jl_heartbeat() must be called
979- // n_reports: for one heartbeat loss interval, how many times to report
980- // reset_reporting_after_s: how long to wait after a heartbeat loss
981- // interval and a return to steady heartbeats, before resetting
982- // reporting behavior
979+ // show_tasks_after_n: number of heartbeats missed before printing task backtraces
980+ // reset_after_n: number of consecutive heartbeats received after which to reset (allow task backtraces to be printed again)
983981//
984982// When disabling heartbeats, the heartbeat thread must wake up,
985983// find out that heartbeats are now disabled, and reset. For now, we
986984// handle this by preventing re-enabling of heartbeats until this
987985// completes.
988- JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int n_reports ,
989- int reset_reporting_after_s )
986+ JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int show_tasks_after_n ,
987+ int reset_after_n )
990988{
991989 if (heartbeat_s <= 0 ) {
992990 heartbeat_enabled = 0 ;
993- heartbeat_interval_s = n_loss_reports = reset_reporting_s = 0 ;
991+ heartbeat_interval_s = tasks_after_n = reset_tasks_after_n = 0 ;
994992 }
995993 else {
996994 // must disable before enabling
@@ -1004,10 +1002,11 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
10041002
10051003 jl_atomic_store_relaxed (& heartbeats , 0 );
10061004 heartbeat_interval_s = heartbeat_s ;
1007- n_loss_reports = n_reports ;
1008- reset_reporting_s = reset_reporting_after_s ;
1009- last_report_s = 0 ;
1010- report_interval_s = heartbeat_interval_s ;
1005+ tasks_after_n = show_tasks_after_n ;
1006+ reset_tasks_after_n = reset_after_n ;
1007+ tasks_showed = 0 ;
1008+ n_hbs_missed = 0 ;
1009+ n_hbs_recvd = 0 ;
10111010 heartbeat_enabled = 1 ;
10121011 uv_sem_post (& heartbeat_on_sem ); // wake the heartbeat thread
10131012 }
@@ -1043,44 +1042,42 @@ void sleep_for(int secs, int nsecs)
10431042uint8_t check_heartbeats (uint8_t gc_state )
10441043{
10451044 int hb = jl_atomic_exchange (& heartbeats , 0 );
1046- uint64_t curr_s = jl_hrtime () / 1e9 ;
10471045
10481046 if (hb <= 0 ) {
1049- // we didn't get a heartbeat in the last interval; should we report?
1050- if ( n_reported < n_loss_reports &&
1051- curr_s - last_report_s >= report_interval_s ) {
1052- jl_task_t * ct = jl_current_task ;
1053- jl_ptls_t ptls = ct -> ptls ;
1054-
1055- // exit GC-safe region to report then re-enter
1056- jl_gc_safe_leave ( ptls , gc_state );
1057- jl_safe_printf ( "==== heartbeat loss ====\n" );
1058- jl_print_task_backtraces ( 0 );
1059- gc_state = jl_gc_safe_enter ( ptls );
1060-
1061- // we've reported
1062- n_reported ++ ;
1063-
1064- // record the reporting time _after_ the report
1065- last_report_s = jl_hrtime () / 1e9 ;
1066-
1067- // double the reporting interval up to a maximum
1068- if ( report_interval_s < 60 * heartbeat_interval_s ) {
1069- report_interval_s *= 2 ;
1047+ // we didn't get a heartbeat
1048+ n_hbs_recvd = 0 ;
1049+ n_hbs_missed ++ ;
1050+
1051+ // if we've printed task backtraces already, do nothing
1052+ if (! tasks_showed ) {
1053+ // otherwise, at least show this message
1054+ jl_safe_printf ( "==== heartbeat loss (%ds) ====\n" ,
1055+ n_hbs_missed * heartbeat_interval_s );
1056+ // if we've missed enough heartbeats, print task backtraces
1057+ if ( n_hbs_missed >= tasks_after_n ) {
1058+ jl_task_t * ct = jl_current_task ;
1059+ jl_ptls_t ptls = ct -> ptls ;
1060+
1061+ // exit GC-safe region to report then re-enter
1062+ jl_gc_safe_leave ( ptls , gc_state );
1063+ jl_print_task_backtraces ( 0 ) ;
1064+ gc_state = jl_gc_safe_enter ( ptls );
1065+
1066+ // we printed task backtraces
1067+ tasks_showed = 1 ;
10701068 }
10711069 }
1072- // no heartbeats, don't change reporting state
1073- return gc_state ;
10741070 }
10751071 else {
1076- // we got a heartbeat; reset the report count
1077- n_reported = 0 ;
1078- }
1079-
1080- // reset the reporting interval only once we're steadily getting
1081- // heartbeats for the requested reset interval
1082- if (curr_s - reset_reporting_s > last_report_s ) {
1083- report_interval_s = heartbeat_interval_s ;
1072+ // got a heartbeat
1073+ n_hbs_recvd ++ ;
1074+ // if we'd printed task backtraces, check for reset
1075+ if (tasks_showed && n_hbs_recvd >= reset_tasks_after_n ) {
1076+ tasks_showed = 0 ;
1077+ jl_safe_printf ("==== heartbeats recovered (lost for %ds) ====\n" ,
1078+ n_hbs_missed * heartbeat_interval_s );
1079+ }
1080+ n_hbs_missed = 0 ;
10841081 }
10851082
10861083 return gc_state ;
@@ -1089,7 +1086,7 @@ uint8_t check_heartbeats(uint8_t gc_state)
10891086// heartbeat thread function
10901087void jl_heartbeat_threadfun (void * arg )
10911088{
1092- int s , ns = 1e9 - 1 , rs ;
1089+ int s = 59 , ns = 1e9 - 1 , rs ;
10931090 uint64_t t0 , tchb ;
10941091
10951092 // We need a TLS because backtraces are accumulated into ptls->bt_size
@@ -1147,8 +1144,8 @@ void jl_init_heartbeat(void)
11471144{
11481145}
11491146
1150- JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int n_reports ,
1151- int reset_reporting_after_s )
1147+ JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int show_tasks_after_n ,
1148+ int reset_after_n )
11521149{
11531150 return -1 ;
11541151}
0 commit comments