diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 309512076a0..bf4cbd09ab5 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -257,9 +257,9 @@ lazy_static! { "sync_lookups_completed_total", "Total count of sync lookups completed", ); - pub static ref SYNC_LOOKUPS_STUCK: Result = try_create_int_gauge( - "sync_lookups_stuck", - "Current count of sync lookups that may be stuck", + pub static ref SYNC_LOOKUPS_STUCK: Result = try_create_int_counter( + "sync_lookups_stuck_total", + "Total count of sync lookups that are stuck and dropped", ); /* diff --git a/beacon_node/network/src/sync/block_lookups/common.rs b/beacon_node/network/src/sync/block_lookups/common.rs index 2791623f3ff..aef76fb0da8 100644 --- a/beacon_node/network/src/sync/block_lookups/common.rs +++ b/beacon_node/network/src/sync/block_lookups/common.rs @@ -2,7 +2,7 @@ use crate::sync::block_lookups::single_block_lookup::{ LookupRequestError, SingleBlockLookup, SingleLookupRequestState, }; use crate::sync::block_lookups::{BlobRequestState, BlockRequestState, PeerId}; -use crate::sync::manager::{Id, SLOT_IMPORT_TOLERANCE}; +use crate::sync::manager::Id; use crate::sync::network_context::{LookupRequestResult, SyncNetworkContext}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; @@ -19,11 +19,6 @@ pub enum ResponseType { Blob, } -/// The maximum depth we will search for a parent block. In principle we should have sync'd any -/// canonical chain to its head once the peer connects. A chain should not appear where it's depth -/// is further back than the most recent head slot. -pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE * 2; - /// This trait unifies common single block lookup functionality across blocks and blobs. This /// includes making requests, verifying responses, and handling processing results. A /// `SingleBlockLookup` includes both a `BlockRequestState` and a `BlobRequestState`, this trait is diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index 0e89eb956cb..94645197c99 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -1,10 +1,10 @@ use self::parent_chain::{compute_parent_chains, NodeChain}; pub use self::single_block_lookup::DownloadResult; use self::single_block_lookup::{LookupRequestError, LookupResult, SingleBlockLookup}; -use super::manager::{BlockProcessType, BlockProcessingResult}; +use super::manager::{BlockProcessType, BlockProcessingResult, SLOT_IMPORT_TOLERANCE}; use super::network_context::{RpcResponseResult, SyncNetworkContext}; use crate::metrics; -use crate::sync::block_lookups::common::{ResponseType, PARENT_DEPTH_TOLERANCE}; +use crate::sync::block_lookups::common::ResponseType; use crate::sync::block_lookups::parent_chain::find_oldest_fork_ancestor; use crate::sync::manager::{Id, SingleLookupReqId}; use beacon_chain::block_verification_types::AsBlock; @@ -28,9 +28,18 @@ mod single_block_lookup; #[cfg(test)] mod tests; +/// The maximum depth we will search for a parent block. In principle we should have sync'd any +/// canonical chain to its head once the peer connects. A chain should not appear where it's depth +/// is further back than the most recent head slot. +pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE * 2; + const FAILED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60; pub const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 4; -const LOOKUP_MAX_DURATION_SECS: u64 = 60; + +/// Maximum time we allow a lookup to exist before assuming it is stuck and will never make +/// progress. Assume the worse case processing time per block component set * times max depth. +/// 15 * 2 * 32 = 16 minutes. +const LOOKUP_MAX_DURATION_SECS: usize = 15 * PARENT_DEPTH_TOLERANCE; pub enum BlockComponent { Block(DownloadResult>>), @@ -680,20 +689,72 @@ impl BlockLookups { ); } - pub fn log_stuck_lookups(&self) { - let mut stuck_count = 0; - for lookup in self.single_block_lookups.values() { - if lookup.elapsed_since_created() > Duration::from_secs(LOOKUP_MAX_DURATION_SECS) { - debug!(self.log, "Lookup maybe stuck"; - // Fields id and block_root are also part of the summary. However, logging them - // here allows log parsers o index them and have better search - "id" => lookup.id, - "block_root" => ?lookup.block_root(), - "summary" => ?lookup, + /// Safety mechanism to unstuck lookup sync. Lookup sync if purely event driven and depends on + /// external components to feed it events to make progress. If there is a bug in network, in + /// beacon processor, or here internally: lookups can get stuck forever. A stuck lookup can + /// stall a node indefinitely as other lookup will be awaiting on a parent lookup to make + /// progress. + /// + /// If a lookup lasts more than LOOKUP_MAX_DURATION_SECS this function will find its oldest + /// ancestor and then drop it and all its children. This action will allow the node to unstuck + /// itself. Bugs that cause lookups to get stuck may be triggered consistently. So this strategy + /// is useful for two reasons: + /// + /// - One single clear warn level log per stuck incident + /// - If the original bug is sporadic, it reduces the time a node is stuck from forever to 15 min + pub fn drop_stuck_lookups(&mut self) { + // While loop to find and drop all disjoint trees of potentially stuck lookups. + while let Some(stuck_lookup) = self.single_block_lookups.values().find(|lookup| { + lookup.elapsed_since_created() > Duration::from_secs(LOOKUP_MAX_DURATION_SECS as u64) + }) { + let ancestor_stuck_lookup = match self.find_oldest_ancestor_lookup(stuck_lookup) { + Ok(lookup) => lookup, + Err(e) => { + warn!(self.log, "Error finding oldest ancestor lookup"; "error" => ?e); + // Default to dropping the lookup that exceeds the max duration so at least + // eventually sync should be unstuck + stuck_lookup + } + }; + + if stuck_lookup.id == ancestor_stuck_lookup.id { + warn!(self.log, "Notify the devs, a sync lookup is stuck"; + "block_root" => ?stuck_lookup.block_root(), + "lookup" => ?stuck_lookup, ); - stuck_count += 1; + } else { + warn!(self.log, "Notify the devs, a sync lookup is stuck"; + "block_root" => ?stuck_lookup.block_root(), + "lookup" => ?stuck_lookup, + "ancestor_block_root" => ?ancestor_stuck_lookup.block_root(), + "ancestor_lookup" => ?ancestor_stuck_lookup, + ); + } + + metrics::inc_counter(&metrics::SYNC_LOOKUPS_STUCK); + self.drop_lookup_and_children(ancestor_stuck_lookup.id); + } + } + + /// Recursively find the oldest ancestor lookup of another lookup + fn find_oldest_ancestor_lookup<'a>( + &'a self, + stuck_lookup: &'a SingleBlockLookup, + ) -> Result<&'a SingleBlockLookup, String> { + if let Some(awaiting_parent) = stuck_lookup.awaiting_parent() { + if let Some(lookup) = self + .single_block_lookups + .values() + .find(|l| l.block_root() == awaiting_parent) + { + self.find_oldest_ancestor_lookup(lookup) + } else { + Err(format!( + "Lookup references unknown parent {awaiting_parent:?}" + )) } + } else { + Ok(stuck_lookup) } - metrics::set_gauge(&metrics::SYNC_LOOKUPS_STUCK, stuck_count); } } diff --git a/beacon_node/network/src/sync/block_lookups/tests.rs b/beacon_node/network/src/sync/block_lookups/tests.rs index b99598fe905..5a85e57f632 100644 --- a/beacon_node/network/src/sync/block_lookups/tests.rs +++ b/beacon_node/network/src/sync/block_lookups/tests.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use super::*; -use crate::sync::block_lookups::common::{ResponseType, PARENT_DEPTH_TOLERANCE}; +use crate::sync::block_lookups::common::ResponseType; use beacon_chain::blob_verification::GossipVerifiedBlob; use beacon_chain::block_verification_types::{BlockImportData, RpcBlock}; use beacon_chain::builder::Witness; diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 71d3113414c..1162f63de0a 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -561,7 +561,7 @@ impl SyncManager { self.handle_new_execution_engine_state(engine_state); } _ = interval.tick() => { - self.block_lookups.log_stuck_lookups(); + self.block_lookups.drop_stuck_lookups(); } } }