diff --git a/src/chain_sync/chain_follower.rs b/src/chain_sync/chain_follower.rs
index 7bbdbd8147cb..6fbee38bd476 100644
--- a/src/chain_sync/chain_follower.rs
+++ b/src/chain_sync/chain_follower.rs
@@ -234,12 +234,14 @@ pub async fn chain_follower(
 
         // Update the sync states
         {
-            let mut status_report_guard = sync_status.write();
-            status_report_guard.update(
+            let old_status_report = sync_status.read().clone();
+            let new_status_report = old_status_report.update(
                 &state_manager,
                 current_active_forks,
                 stateless_mode,
             );
+
+            sync_status.write().clone_from(&new_status_report);
         }
 
         for task in task_vec {
diff --git a/src/chain_sync/sync_status.rs b/src/chain_sync/sync_status.rs
index d40f508426a7..ea6e0d3beadd 100644
--- a/src/chain_sync/sync_status.rs
+++ b/src/chain_sync/sync_status.rs
@@ -138,36 +138,36 @@ impl SyncStatusReport {
         }
     }
 
+    /// Updates the sync status report based on the current state of the node and network.
+    /// This does not modify the existing report but returns a new one with updated information.
     pub(crate) fn update(
-        &mut self,
+        &self,
         state_manager: &StateManager,
-        current_active_forks: Vec,
+        active_forks: Vec,
         stateless_mode: bool,
-    ) {
+    ) -> Self {
         let heaviest = state_manager.chain_store().heaviest_tipset();
-        let current_chain_head_epoch = heaviest.epoch();
-        self.current_head_key = Some(heaviest.key().clone());
-        self.current_head_epoch = current_chain_head_epoch;
+        let current_head_epoch = heaviest.epoch();
+        let current_head_key = Some(heaviest.key().clone());
 
-        let now = Utc::now();
-        let now_ts = now.timestamp() as u64;
+        let last_updated = Utc::now();
+        let last_updated_ts = last_updated.timestamp() as u64;
         let seconds_per_epoch = state_manager.chain_config().block_delay_secs;
         let network_head_epoch = calculate_expected_epoch(
-            now_ts,
+            last_updated_ts,
             state_manager.chain_store().genesis_block_header().timestamp,
             seconds_per_epoch,
         );
-        self.network_head_epoch = network_head_epoch;
-        self.epochs_behind = network_head_epoch.saturating_sub(current_chain_head_epoch);
+        let epochs_behind = network_head_epoch.saturating_sub(current_head_epoch);
 
         log::trace!(
             "Sync status report: current head epoch: {}, network head epoch: {}, epochs behind: {}",
-            current_chain_head_epoch,
+            current_head_epoch,
             network_head_epoch,
-            self.epochs_behind
+            epochs_behind
         );
 
-        let time_diff = now_ts.saturating_sub(heaviest.min_timestamp());
+        let time_diff = last_updated_ts.saturating_sub(heaviest.min_timestamp());
         let status = match stateless_mode {
             true => NodeSyncStatus::Offline,
             false => {
@@ -179,9 +179,16 @@ impl SyncStatusReport {
             }
         };
 
-        self.status = status;
-        self.active_forks = current_active_forks;
-        self.last_updated = now;
+        Self {
+            node_start_time: self.node_start_time,
+            current_head_epoch,
+            current_head_key,
+            network_head_epoch,
+            epochs_behind,
+            status,
+            active_forks,
+            last_updated,
+        }
     }
 
     pub(crate) fn is_synced(&self) -> bool {
diff --git a/src/health/endpoints.rs b/src/health/endpoints.rs
index 3c29b16c5484..8b6bcd76aee8 100644
--- a/src/health/endpoints.rs
+++ b/src/health/endpoints.rs
@@ -13,6 +13,11 @@ use crate::rpc::f3::F3IsRunning;
 /// Query parameter for verbose responses
 const VERBOSE_PARAM: &str = "verbose";
 
+/// Maximum duration to wait for a request to complete. This value is kind of arbitrary, we need
+/// the health probes to be snappy and unless running on a very slow machine, 2 seconds should be
+/// plenty of time for a localhost check.
+const MAX_REQ_DURATION_SECS: u64 = 2;
+
 /// Liveness probes determine whether or not an application running in a container is in a healthy state. The idea behind a liveness probe is that it fails for prolonged period of time, then the application should be restarted.
 /// In our case, we require:
 /// - The node is not in an error state (i.e., boot-looping)
@@ -136,9 +141,12 @@ async fn check_rpc_server_running(state: &ForestState, acc: &mut MessageAccumula
     if !state.config.client.enable_rpc {
         acc.push_ok("rpc server disabled");
         true
-    } else if tokio::net::TcpStream::connect(state.config.client.rpc_address)
-        .await
-        .is_ok()
+    } else if tokio::time::timeout(
+        std::time::Duration::from_secs(MAX_REQ_DURATION_SECS),
+        tokio::net::TcpStream::connect(state.config.client.rpc_address),
+    )
+    .await
+    .is_ok_and(|connected| connected.is_ok())
     {
         acc.push_ok("rpc server running");
         true
@@ -163,7 +171,13 @@ async fn check_f3_running(state: &ForestState, acc: &mut MessageAccumulator) ->
     if !crate::f3::is_sidecar_ffi_enabled(&state.chain_config) {
         acc.push_ok("f3 disabled");
         true
-    } else if F3IsRunning::is_f3_running().await.unwrap_or_default() {
+    } else if tokio::time::timeout(
+        std::time::Duration::from_secs(MAX_REQ_DURATION_SECS),
+        F3IsRunning::is_f3_running(),
+    )
+    .await
+    .is_ok_and(|is_running| is_running.unwrap_or_default())
+    {
         acc.push_ok("f3 running");
         true
     } else if crate::f3::get_f3_sidecar_params(&state.chain_config).bootstrap_epoch
diff --git a/src/libp2p/peer_manager.rs b/src/libp2p/peer_manager.rs
index cd755641d902..8f5fea1aec31 100644
--- a/src/libp2p/peer_manager.rs
+++ b/src/libp2p/peer_manager.rs
@@ -168,13 +168,15 @@ impl PeerManager {
     /// duration.
     pub fn log_failure(&self, peer: &PeerId, dur: Duration) {
        trace!("logging failure for {peer}");
-        let mut peers = self.peers.write();
-        if !peers.bad_peers.contains(peer) {
-            metrics::PEER_FAILURE_TOTAL.inc();
-            let peer_stats = peers.full_peers.entry(*peer).or_default();
-            peer_stats.failures += 1;
-            log_time(peer_stats, dur);
+        if self.peers.read().bad_peers.contains(peer) {
+            return;
         }
+
+        metrics::PEER_FAILURE_TOTAL.inc();
+        let mut peers = self.peers.write();
+        let peer_stats = peers.full_peers.entry(*peer).or_default();
+        peer_stats.failures += 1;
+        log_time(peer_stats, dur);
     }
 
     /// Removes a peer from the set and returns true if the value was present
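Note on the health-endpoint changes above: each probe is now bounded by tokio::time::timeout, so a hung TCP connect or F3 query makes the check fail fast instead of stalling the probe. The sketch below shows the same pattern in isolation; it assumes a tokio runtime with the net, time, rt, and macros features, and the probe_tcp helper and the address are illustrative only, not part of this patch.

    use std::time::Duration;

    // Illustrative helper (not from the patch): returns true only if the connection
    // both completes before the deadline and succeeds.
    async fn probe_tcp(addr: std::net::SocketAddr, deadline: Duration) -> bool {
        tokio::time::timeout(deadline, tokio::net::TcpStream::connect(addr))
            .await
            // Outer Err: deadline elapsed. Inner Err: connection refused or failed.
            .is_ok_and(|connected| connected.is_ok())
    }

    #[tokio::main]
    async fn main() {
        // Two seconds mirrors MAX_REQ_DURATION_SECS from the patch; the address is made up.
        let reachable = probe_tcp("127.0.0.1:2345".parse().unwrap(), Duration::from_secs(2)).await;
        println!("rpc reachable: {reachable}");
    }

The F3 check follows the same shape: a timeout Err collapses to false via is_ok_and, so a stalled dependency reports unhealthy rather than hanging the endpoint.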