Skip to content

Commit

Permalink
fix av-distribution Jaeger spans mem leak (#5321)
Browse files Browse the repository at this point in the history
Fixes #5258
  • Loading branch information
alindima authored Aug 12, 2024
1 parent 1f49358 commit fc906d5
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 11 deletions.
12 changes: 6 additions & 6 deletions polkadot/node/network/availability-distribution/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use polkadot_node_subsystem::{
jaeger, messages::AvailabilityDistributionMessage, overseer, FromOrchestra, OverseerSignal,
SpawnedSubsystem, SubsystemError,
};
use polkadot_primitives::Hash;
use polkadot_primitives::{BlockNumber, Hash};
use std::collections::HashMap;

/// Error and [`Result`] type for this subsystem.
Expand Down Expand Up @@ -104,7 +104,7 @@ impl AvailabilityDistributionSubsystem {
/// Start processing work as passed on from the Overseer.
async fn run<Context>(self, mut ctx: Context) -> std::result::Result<(), FatalError> {
let Self { mut runtime, recvs, metrics, req_protocol_names } = self;
let mut spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
let mut spans: HashMap<Hash, (BlockNumber, jaeger::PerLeafSpan)> = HashMap::new();

let IncomingRequestReceivers {
pov_req_receiver,
Expand Down Expand Up @@ -162,7 +162,7 @@ impl AvailabilityDistributionSubsystem {
};
let span =
jaeger::PerLeafSpan::new(cloned_leaf.span, "availability-distribution");
spans.insert(cloned_leaf.hash, span);
spans.insert(cloned_leaf.hash, (cloned_leaf.number, span));
log_error(
requester
.get_mut()
Expand All @@ -172,8 +172,8 @@ impl AvailabilityDistributionSubsystem {
&mut warn_freq,
)?;
},
FromOrchestra::Signal(OverseerSignal::BlockFinalized(hash, _)) => {
spans.remove(&hash);
FromOrchestra::Signal(OverseerSignal::BlockFinalized(_hash, finalized_number)) => {
spans.retain(|_hash, (block_number, _span)| *block_number > finalized_number);
},
FromOrchestra::Signal(OverseerSignal::Conclude) => return Ok(()),
FromOrchestra::Communication {
Expand All @@ -189,7 +189,7 @@ impl AvailabilityDistributionSubsystem {
} => {
let span = spans
.get(&relay_parent)
.map(|span| span.child("fetch-pov"))
.map(|(_, span)| span.child("fetch-pov"))
.unwrap_or_else(|| jaeger::Span::new(&relay_parent, "fetch-pov"))
.with_trace_id(candidate_hash)
.with_candidate(candidate_hash)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ use polkadot_node_subsystem_util::{
availability_chunks::availability_chunk_index,
runtime::{get_occupied_cores, RuntimeInfo},
};
use polkadot_primitives::{CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex};
use polkadot_primitives::{
BlockNumber, CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex,
};

use super::{FatalError, Metrics, Result, LOG_TARGET};

Expand Down Expand Up @@ -112,14 +114,14 @@ impl Requester {
ctx: &mut Context,
runtime: &mut RuntimeInfo,
update: ActiveLeavesUpdate,
spans: &HashMap<Hash, jaeger::PerLeafSpan>,
spans: &HashMap<Hash, (BlockNumber, jaeger::PerLeafSpan)>,
) -> Result<()> {
gum::trace!(target: LOG_TARGET, ?update, "Update fetching heads");
let ActiveLeavesUpdate { activated, deactivated } = update;
if let Some(leaf) = activated {
let span = spans
.get(&leaf.hash)
.map(|span| span.child("update-fetching-heads"))
.map(|(_, span)| span.child("update-fetching-heads"))
.unwrap_or_else(|| jaeger::Span::new(&leaf.hash, "update-fetching-heads"))
.with_string_tag("leaf", format!("{:?}", leaf.hash))
.with_stage(jaeger::Stage::AvailabilityDistribution);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ fn check_ancestry_lookup_in_same_session() {

test_harness(test_state.clone(), |mut ctx| async move {
let chain = &test_state.relay_chain;
let spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
let spans: HashMap<Hash, (u32, jaeger::PerLeafSpan)> = HashMap::new();
let block_number = 1;
let update = ActiveLeavesUpdate {
activated: Some(new_leaf(chain[block_number], block_number as u32)),
Expand Down Expand Up @@ -281,7 +281,7 @@ fn check_ancestry_lookup_in_different_sessions() {

test_harness(test_state.clone(), |mut ctx| async move {
let chain = &test_state.relay_chain;
let spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
let spans: HashMap<Hash, (u32, jaeger::PerLeafSpan)> = HashMap::new();
let block_number = 3;
let update = ActiveLeavesUpdate {
activated: Some(new_leaf(chain[block_number], block_number as u32)),
Expand Down
11 changes: 11 additions & 0 deletions prdoc/pr_5321.prdoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
title: fix availability-distribution Jaeger spans memory leak

doc:
- audience: Node Dev
description: |
Fixes a memory leak which caused the Jaeger span storage in availability-distribution to never be pruned and therefore increasing indefinitely.
This was caused by improper handling of finalized heads. More info in https://github.com/paritytech/polkadot-sdk/issues/5258

crates:
- name: polkadot-availability-distribution
bump: patch

0 comments on commit fc906d5

Please sign in to comment.