From b9b8a7d83d727a3c05b259b62ee3979d922060d8 Mon Sep 17 00:00:00 2001
From: Jimmy Chen
Date: Fri, 12 Jul 2024 21:04:58 +1000
Subject: [PATCH] Avoid retrying same sampling peer that previously failed.

---
 .../network/src/sync/block_lookups/tests.rs | 36 +++++++++++++++++++
 beacon_node/network/src/sync/sampling.rs    |  6 ++--
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/beacon_node/network/src/sync/block_lookups/tests.rs b/beacon_node/network/src/sync/block_lookups/tests.rs
index 18d74625145..87a91c74092 100644
--- a/beacon_node/network/src/sync/block_lookups/tests.rs
+++ b/beacon_node/network/src/sync/block_lookups/tests.rs
@@ -651,6 +651,21 @@ impl TestRig {
         });
     }
 
+    fn sampling_requests_failed(
+        &mut self,
+        sampling_ids: DCByRootIds,
+        peer_id: PeerId,
+        error: RPCError,
+    ) {
+        for (request_id, _) in sampling_ids {
+            self.send_sync_message(SyncMessage::RpcError {
+                peer_id,
+                request_id,
+                error: error.clone(),
+            })
+        }
+    }
+
     fn complete_valid_block_request(
         &mut self,
         id: SingleLookupReqId,
@@ -1964,6 +1979,27 @@ fn sampling_with_retries() {
     r.expect_clean_finished_sampling();
 }
 
+#[test]
+fn sampling_avoid_retrying_same_peer() {
+    let Some(mut r) = TestRig::test_setup_after_peerdas() else {
+        return;
+    };
+    let peer_id_1 = r.new_connected_supernode_peer();
+    let peer_id_2 = r.new_connected_supernode_peer();
+    let block_root = Hash256::random();
+    r.trigger_sample_block(block_root, Slot::new(0));
+    // Retrieve all outgoing sample requests for random column indexes, and fail them all
+    let sampling_ids =
+        r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES);
+    r.sampling_requests_failed(sampling_ids, peer_id_1, RPCError::Disconnected);
+    // Should retry the other peer
+    let sampling_ids =
+        r.expect_only_data_columns_by_root_requests(block_root, SAMPLING_REQUIRED_SUCCESSES);
+    r.sampling_requests_failed(sampling_ids, peer_id_2, RPCError::Disconnected);
+    // Expect no more retries
+    r.expect_empty_network();
+}
+
 #[test]
 fn custody_lookup_happy_path() {
     let Some(mut r) = TestRig::test_setup_after_peerdas() else {
diff --git a/beacon_node/network/src/sync/sampling.rs b/beacon_node/network/src/sync/sampling.rs
index 3fb21489d91..9cfd41dead7 100644
--- a/beacon_node/network/src/sync/sampling.rs
+++ b/beacon_node/network/src/sync/sampling.rs
@@ -433,7 +433,6 @@ mod request {
         column_index: ColumnIndex,
         status: Status,
         // TODO(das): Should downscore peers that claim to not have the sample?
-        #[allow(dead_code)]
         peers_dont_have: HashSet<PeerId>,
     }
 
@@ -490,11 +489,13 @@ mod request {
 
             // TODO: When is a fork and only a subset of your peers know about a block, sampling should only
             // be queried on the peers on that fork. Should this case be handled? How to handle it?
-            let peer_ids = cx.get_custodial_peers(
+            let mut peer_ids = cx.get_custodial_peers(
                 block_slot.epoch(<T::EthSpec as EthSpec>::slots_per_epoch()),
                 self.column_index,
             );
 
+            peer_ids.retain(|peer_id| !self.peers_dont_have.contains(peer_id));
+
             // TODO(das) randomize custodial peer and avoid failing peers
             if let Some(peer_id) = peer_ids.first().cloned() {
                 cx.data_column_lookup_request(
@@ -521,6 +522,7 @@ mod request {
         pub(crate) fn on_sampling_error(&mut self) -> Result<PeerId, String> {
             match self.status.clone() {
                 Status::Sampling(peer_id) => {
+                    self.peers_dont_have.insert(peer_id);
                     self.status = Status::NotStarted;
                     Ok(peer_id)
                 }
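
For readers who want the retry-avoidance bookkeeping in isolation, below is a minimal, self-contained sketch of the pattern this patch introduces. All names here (ColumnSampleRequest, continue_sampling's signature, the u64 PeerId) are simplified stand-ins that only loosely mirror sampling.rs, not Lighthouse's actual types or API.

    use std::collections::HashSet;

    // Toy stand-ins; Lighthouse's real PeerId comes from libp2p.
    type PeerId = u64;
    type ColumnIndex = u64;

    // Loosely mirrors the shape of an active column sample request: one
    // column, at most one in-flight peer, plus the peers that already failed.
    struct ColumnSampleRequest {
        column_index: ColumnIndex,
        current_peer: Option<PeerId>,
        peers_dont_have: HashSet<PeerId>,
    }

    impl ColumnSampleRequest {
        fn new(column_index: ColumnIndex) -> Self {
            Self {
                column_index,
                current_peer: None,
                peers_dont_have: HashSet::new(),
            }
        }

        // Pick the next custodial peer, skipping any that already failed for
        // this column. `None` means every candidate has failed, so the caller
        // stops retrying (the `expect_empty_network` case in the new test).
        fn continue_sampling(&mut self, custodial_peers: &[PeerId]) -> Option<PeerId> {
            let peer = custodial_peers
                .iter()
                .copied()
                .find(|p| !self.peers_dont_have.contains(p))?;
            self.current_peer = Some(peer);
            Some(peer)
        }

        // On an RPC error, remember the failing peer so it is never retried
        // for this column, then clear in-flight state so sampling restarts.
        fn on_sampling_error(&mut self) -> Option<PeerId> {
            let peer = self.current_peer.take()?;
            self.peers_dont_have.insert(peer);
            Some(peer)
        }
    }

    fn main() {
        let peers = [1, 2];
        let mut req = ColumnSampleRequest::new(0);

        // First attempt goes to peer 1, which then disconnects.
        assert_eq!(req.continue_sampling(&peers), Some(1));
        req.on_sampling_error();

        // The retry must pick the other peer, never peer 1 again.
        assert_eq!(req.continue_sampling(&peers), Some(2));
        req.on_sampling_error();

        // Both peers have failed: no further retries.
        assert_eq!(req.continue_sampling(&peers), None);
        println!("column {}: exhausted all peers", req.column_index);
    }

The invariant matches the patch: a peer that errored for a given column lands in peers_dont_have and is filtered out of every subsequent candidate list, so retries terminate once all custodial peers have failed.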