diff --git a/cmd/zfs_object_agent/zettaobject/src/reclaim.rs b/cmd/zfs_object_agent/zettaobject/src/reclaim.rs index 531a1b16ecba..40f61f3a0869 100644 --- a/cmd/zfs_object_agent/zettaobject/src/reclaim.rs +++ b/cmd/zfs_object_agent/zettaobject/src/reclaim.rs @@ -65,7 +65,8 @@ tunable! { // When reclaiming free blocks, allow this many concurrent GetObject+PutObject requests. static ref RECLAIM_QUEUE_DEPTH: usize = 200; - static ref VERIFY_RECLAIM_PUT: bool = true; + static ref VERIFY_RECLAIM_PUT: bool = false; + static ref MITIGATE_OVERWRITE_BUG: bool = true; } lazy_static! { @@ -698,6 +699,18 @@ impl Reclaim { freed_blocks_count += u64::from(later_blocks_freed); freed_blocks_bytes += u64::from(later_bytes_freed); objects_to_consolidate.push((later_object_new_size, frees)); + if *MITIGATE_OVERWRITE_BUG { + // A timed-out PutObject may take effect at an unknown later time. If a + // subsequent PutObject overwrites the same object with different data, + // the first PutObject may be applied at a later time, overwriting the + // second PutObject's data. If the overwritten data contains blocks from + // a different (consolidated) object, those blocks will be lost. To + // mitigate this, we do not consolidate objects. This way, if a + // timed-out PutObject takes effect later, the reverted state of the + // object only contains additional blocks (which will be leaked, at least + // until the entire object is delted). + break; + } } // XXX look for earlier objects too? @@ -869,15 +882,30 @@ async fn reclaim_frees_object( .blocks .retain(|block, _| block < &next_block); - assert_ge!(phys.header.blocks_size, object_size.num_bytes); + if !*MITIGATE_OVERWRITE_BUG { + assert_ge!(phys.header.blocks_size, object_size.num_bytes); + } phys.header.blocks_size = object_size.num_bytes; assert_ge!(phys.header.next_block, next_block); phys.header.next_block = next_block; } - assert_eq!(phys.header.blocks_size, phys.calculate_blocks_size()); - assert_eq!(phys.header.blocks_size, object_size.num_bytes); - assert_eq!(phys.blocks_len(), object_size.num_blocks); + if *MITIGATE_OVERWRITE_BUG { + // A timed-out and then later-applied PutObject may have reverted the object to a + // previous state which has more blocks than we expect. + if phys.header.blocks_size != object_size.num_bytes || + phys.blocks_len() != object_size.num_blocks + { + warn!("reclaim: {object:?} expected size {} blocks {}, found size {} blocks {}", + object_size.num_bytes, object_size.num_blocks, + phys.header.blocks_size, phys.blocks_len()); + } + phys.header.blocks_size = phys.calculate_blocks_size(); + } else { + assert_eq!(phys.header.blocks_size, phys.calculate_blocks_size()); + assert_eq!(phys.header.blocks_size, object_size.num_bytes); + assert_eq!(phys.blocks_len(), object_size.num_blocks); + } phys })