From 715f750ed5bc090aa7c13158585cd82b825924da Mon Sep 17 00:00:00 2001 From: Conrado Gouvea Date: Fri, 15 Oct 2021 13:38:37 -0300 Subject: [PATCH 1/3] Ignore AlreadyInChain error in the syncer --- zebra-consensus/src/block.rs | 1 + zebra-consensus/src/checkpoint.rs | 1 + zebra-consensus/src/lib.rs | 3 ++ zebrad/src/components/sync.rs | 42 +++++++++++++++++++------ zebrad/src/components/sync/downloads.rs | 42 ++++++++++++++++++++----- 5 files changed, 73 insertions(+), 16 deletions(-) diff --git a/zebra-consensus/src/block.rs b/zebra-consensus/src/block.rs index 8db7564df6b..dfda62165ba 100644 --- a/zebra-consensus/src/block.rs +++ b/zebra-consensus/src/block.rs @@ -48,6 +48,7 @@ pub struct BlockVerifier { // TODO: dedupe with crate::error::BlockError #[non_exhaustive] +#[allow(missing_docs)] #[derive(Debug, Error)] pub enum VerifyBlockError { #[error("unable to verify depth for block {hash} from chain state during block verification")] diff --git a/zebra-consensus/src/checkpoint.rs b/zebra-consensus/src/checkpoint.rs index 965b668133a..b83c0346794 100644 --- a/zebra-consensus/src/checkpoint.rs +++ b/zebra-consensus/src/checkpoint.rs @@ -836,6 +836,7 @@ where } #[derive(Debug, Error)] +#[allow(missing_docs)] pub enum VerifyCheckpointError { #[error("checkpoint request after the final checkpoint has been verified")] Finished, diff --git a/zebra-consensus/src/lib.rs b/zebra-consensus/src/lib.rs index ca55b0bd932..56e43493154 100644 --- a/zebra-consensus/src/lib.rs +++ b/zebra-consensus/src/lib.rs @@ -53,9 +53,12 @@ pub mod chain; #[allow(missing_docs)] pub mod error; +pub use block::VerifyBlockError; +pub use checkpoint::VerifyCheckpointError; pub use checkpoint::MAX_CHECKPOINT_BYTE_COUNT; pub use checkpoint::MAX_CHECKPOINT_HEIGHT_GAP; pub use config::Config; +pub use error::BlockError; /// A boxed [`std::error::Error`]. pub type BoxError = Box; diff --git a/zebrad/src/components/sync.rs b/zebrad/src/components/sync.rs index 018e1a71f73..c5ff132235c 100644 --- a/zebrad/src/components/sync.rs +++ b/zebrad/src/components/sync.rs @@ -15,10 +15,15 @@ use zebra_chain::{ block::{self, Block}, parameters::genesis_hash, }; +use zebra_consensus::{ + chain::VerifyChainError, BlockError, VerifyBlockError, VerifyCheckpointError, +}; use zebra_network as zn; use zebra_state as zs; -use crate::{config::ZebradConfig, BoxError}; +use crate::{ + components::sync::downloads::BlockDownloadVerifyError, config::ZebradConfig, BoxError, +}; mod downloads; mod gossip; @@ -315,10 +320,7 @@ where tracing::trace!(?hash, "verified and committed block to state"); } Err(e) => { - if format!("{:?}", e).contains("AlreadyVerified {") { - tracing::debug!(error = ?e, "block was already verified, possibly from a previous sync run, continuing"); - } else { - tracing::warn!(?e, "error downloading and verifying block"); + if Self::should_restart_sync(e) { continue 'sync; } } @@ -352,10 +354,7 @@ where } Err(e) => { - if format!("{:?}", e).contains("AlreadyVerified {") { - tracing::debug!(error = ?e, "block was already verified, possibly from a previous sync run, continuing"); - } else { - tracing::warn!(?e, "error downloading and verifying block"); + if Self::should_restart_sync(e) { continue 'sync; } } @@ -715,4 +714,29 @@ where self.downloads.in_flight() as f64 ); } + + /// Return if the sync should be restarted based on the given error + /// from the block downloader and verifier stream. + fn should_restart_sync(e: BlockDownloadVerifyError) -> bool { + match e { + BlockDownloadVerifyError::Invalid(VerifyChainError::Checkpoint( + VerifyCheckpointError::AlreadyVerified { .. }, + )) => { + tracing::debug!(error = ?e, "block was already verified, possibly from a previous sync run, continuing"); + false + } + BlockDownloadVerifyError::Invalid(VerifyChainError::Block( + VerifyBlockError::Block { + source: BlockError::AlreadyInChain(_, _), + }, + )) => { + tracing::debug!(error = ?e, "block is already in chain, possibly from a previous sync run, continuing"); + false + } + _ => { + tracing::warn!(?e, "error downloading and verifying block"); + true + } + } + } } diff --git a/zebrad/src/components/sync/downloads.rs b/zebrad/src/components/sync/downloads.rs index ce5089c2d32..143b2082ad0 100644 --- a/zebrad/src/components/sync/downloads.rs +++ b/zebrad/src/components/sync/downloads.rs @@ -12,6 +12,7 @@ use futures::{ stream::{FuturesUnordered, Stream}, }; use pin_project::pin_project; +use thiserror::Error; use tokio::{sync::oneshot, task::JoinHandle}; use tower::{hedge, Service, ServiceExt}; use tracing_futures::Instrument; @@ -33,6 +34,23 @@ impl hedge::Policy for AlwaysHedge { } } +/// Errors that can occur while downloading and verifying a block. +#[derive(Error, Debug)] +#[allow(dead_code)] +pub enum BlockDownloadVerifyError { + #[error("error downloading block")] + DownloadFailed(#[source] BoxError), + + #[error("error from the verifier service")] + VerifierError(#[source] BoxError), + + #[error("block did not pass consensus validation")] + Invalid(#[from] zebra_consensus::chain::VerifyChainError), + + #[error("block download / verification was cancelled")] + Cancelled, +} + /// Represents a [`Stream`] of download and verification tasks during chain sync. #[pin_project] #[derive(Debug)] @@ -54,7 +72,8 @@ where // Internal downloads state /// A list of pending block download and verify tasks. #[pin] - pending: FuturesUnordered>>, + pending: + FuturesUnordered>>, /// A list of channels that can be used to cancel pending block download and /// verify tasks. @@ -68,7 +87,7 @@ where ZV: Service, Response = block::Hash, Error = BoxError> + Send + Clone + 'static, ZV::Future: Send, { - type Item = Result; + type Item = Result; fn poll_next(self: Pin<&mut Self>, cx: &mut Context) -> Poll> { let this = self.project(); @@ -163,9 +182,9 @@ where _ = &mut cancel_rx => { tracing::trace!("task cancelled prior to download completion"); metrics::counter!("sync.cancelled.download.count", 1); - return Err("canceled block_fetch_verify".into()) + return Err(BlockDownloadVerifyError::Cancelled) } - rsp = block_req => rsp?, + rsp = block_req => rsp.map_err(BlockDownloadVerifyError::DownloadFailed)?, }; let block = if let zn::Response::Blocks(blocks) = rsp { @@ -178,14 +197,18 @@ where }; metrics::counter!("sync.downloaded.block.count", 1); - let rsp = verifier.ready_and().await?.call(block); + let rsp = verifier + .ready_and() + .await + .map_err(BlockDownloadVerifyError::VerifierError)? + .call(block); // TODO: if the verifier and cancel are both ready, which should // we prefer? (Currently, select! chooses one at random.) let verification = tokio::select! { _ = &mut cancel_rx => { tracing::trace!("task cancelled prior to verification"); metrics::counter!("sync.cancelled.verify.count", 1); - return Err("canceled block_fetch_verify".into()) + return Err(BlockDownloadVerifyError::Cancelled) } verification = rsp => verification, }; @@ -193,7 +216,12 @@ where metrics::counter!("sync.verified.block.count", 1); } - verification + verification.map_err(|err| { + match err.downcast::() { + Ok(e) => BlockDownloadVerifyError::Invalid(*e), + Err(e) => BlockDownloadVerifyError::VerifierError(e), + } + }) } .in_current_span() // Tack the hash onto the error so we can remove the cancel handle From 23b0360aaf1ce9e98a67d72c0d112dd7f57cb208 Mon Sep 17 00:00:00 2001 From: Conrado Gouvea Date: Mon, 18 Oct 2021 14:09:51 -0300 Subject: [PATCH 2/3] Split Cancelled errors; add them to should_restart_sync exceptions --- zebrad/src/components/sync.rs | 5 +++++ zebrad/src/components/sync/downloads.rs | 11 +++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/zebrad/src/components/sync.rs b/zebrad/src/components/sync.rs index c5ff132235c..7d25fc20e85 100644 --- a/zebrad/src/components/sync.rs +++ b/zebrad/src/components/sync.rs @@ -733,6 +733,11 @@ where tracing::debug!(error = ?e, "block is already in chain, possibly from a previous sync run, continuing"); false } + BlockDownloadVerifyError::CancelledDuringDownload + | BlockDownloadVerifyError::CancelledDuringVerification => { + tracing::debug!(error = ?e, "block verification was cancelled, continuing"); + false + } _ => { tracing::warn!(?e, "error downloading and verifying block"); true diff --git a/zebrad/src/components/sync/downloads.rs b/zebrad/src/components/sync/downloads.rs index 143b2082ad0..63484204542 100644 --- a/zebrad/src/components/sync/downloads.rs +++ b/zebrad/src/components/sync/downloads.rs @@ -47,8 +47,11 @@ pub enum BlockDownloadVerifyError { #[error("block did not pass consensus validation")] Invalid(#[from] zebra_consensus::chain::VerifyChainError), - #[error("block download / verification was cancelled")] - Cancelled, + #[error("block download / verification was cancelled during download")] + CancelledDuringDownload, + + #[error("block download / verification was cancelled during verification")] + CancelledDuringVerification, } /// Represents a [`Stream`] of download and verification tasks during chain sync. @@ -182,7 +185,7 @@ where _ = &mut cancel_rx => { tracing::trace!("task cancelled prior to download completion"); metrics::counter!("sync.cancelled.download.count", 1); - return Err(BlockDownloadVerifyError::Cancelled) + return Err(BlockDownloadVerifyError::CancelledDuringDownload) } rsp = block_req => rsp.map_err(BlockDownloadVerifyError::DownloadFailed)?, }; @@ -208,7 +211,7 @@ where _ = &mut cancel_rx => { tracing::trace!("task cancelled prior to verification"); metrics::counter!("sync.cancelled.verify.count", 1); - return Err(BlockDownloadVerifyError::Cancelled) + return Err(BlockDownloadVerifyError::CancelledDuringVerification) } verification = rsp => verification, }; From c01c0d9ee47e4244b0e94f1b7f294e9b831da782 Mon Sep 17 00:00:00 2001 From: Conrado Gouvea Date: Tue, 19 Oct 2021 15:26:34 -0300 Subject: [PATCH 3/3] Also filter 'block is already comitted'; try to detect a wrong downcast --- zebrad/src/components/sync.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/zebrad/src/components/sync.rs b/zebrad/src/components/sync.rs index 7d25fc20e85..66185510e53 100644 --- a/zebrad/src/components/sync.rs +++ b/zebrad/src/components/sync.rs @@ -733,12 +733,40 @@ where tracing::debug!(error = ?e, "block is already in chain, possibly from a previous sync run, continuing"); false } + BlockDownloadVerifyError::Invalid(VerifyChainError::Block( + VerifyBlockError::Commit(ref source), + )) if format!("{:?}", source).contains("block is already committed to the state") => { + // TODO: improve this by checking the type + // https://github.com/ZcashFoundation/zebra/issues/2908 + tracing::debug!(error = ?e, "block is already committed, possibly from a previous sync run, continuing"); + false + } BlockDownloadVerifyError::CancelledDuringDownload | BlockDownloadVerifyError::CancelledDuringVerification => { tracing::debug!(error = ?e, "block verification was cancelled, continuing"); false } _ => { + // download_and_verify downcasts errors from the block verifier + // into VerifyChainError, and puts the result inside one of the + // BlockDownloadVerifyError enumerations. This downcast could + // become incorrect e.g. after some refactoring, and it is difficult + // to write a test to check it. The test below is a best-effort + // attempt to catch if that happens and log it. + // TODO: add a proper test and remove this + // https://github.com/ZcashFoundation/zebra/issues/2909 + let err_str = format!("{:?}", e); + if err_str.contains("AlreadyVerified") + || err_str.contains("AlreadyInChain") + || err_str.contains("block is already committed to the state") + { + tracing::error!(?e, + "a BlockDownloadVerifyError that should have been filtered out was detected, \ + which possibly indicates a programming error in the downcast inside \ + zebrad::components::sync::downloads::Downloads::download_and_verify" + ) + } + tracing::warn!(?e, "error downloading and verifying block"); true }