From 98a0de37b07c2cbe558660db615a2b0f602da498 Mon Sep 17 00:00:00 2001
From: Arvid Norberg
Date: Tue, 7 Jan 2025 20:11:13 +0100
Subject: [PATCH] fix Serializer to correctly restore() to allow resumption of serialization

---
 fuzz/fuzz_targets/incremental_serializer.rs |   6 +-
 fuzz/fuzz_targets/serializer.rs             |   4 +-
 src/serde/identity_hash.rs                  |   1 +
 src/serde/incremental.rs                    | 183 ++++++++++++++++----
 src/serde/read_cache_lookup.rs              |   2 +-
 src/serde/test.rs                           |   4 +-
 6 files changed, 161 insertions(+), 39 deletions(-)

diff --git a/fuzz/fuzz_targets/incremental_serializer.rs b/fuzz/fuzz_targets/incremental_serializer.rs
index 1109af66..5bb980d8 100644
--- a/fuzz/fuzz_targets/incremental_serializer.rs
+++ b/fuzz/fuzz_targets/incremental_serializer.rs
@@ -94,10 +94,10 @@ fn do_fuzz(data: &[u8], short_atoms: bool) {
     {
         node_idx += 1;
 
-        let mut ser = Serializer::new();
-        let (done, _) = ser.add(&allocator, first_step, Some(sentinel)).unwrap();
+        let mut ser = Serializer::new(Some(sentinel));
+        let (done, _) = ser.add(&allocator, first_step).unwrap();
         assert!(!done);
-        let (done, _) = ser.add(&allocator, second_step, None).unwrap();
+        let (done, _) = ser.add(&allocator, second_step).unwrap();
         assert!(done);
 
         // now, make sure that we deserialize to the exact same structure, by
diff --git a/fuzz/fuzz_targets/serializer.rs b/fuzz/fuzz_targets/serializer.rs
index 2e58a2e6..f116a86a 100644
--- a/fuzz/fuzz_targets/serializer.rs
+++ b/fuzz/fuzz_targets/serializer.rs
@@ -19,8 +19,8 @@ fn do_fuzz(data: &[u8], short_atoms: bool) {
 
     let b1 = node_to_bytes_backrefs(&allocator, program).unwrap();
 
-    let mut ser = Serializer::new();
-    let (done, _) = ser.add(&allocator, program, None).unwrap();
+    let mut ser = Serializer::new(None);
+    let (done, _) = ser.add(&allocator, program).unwrap();
     assert!(done);
     let b2 = ser.into_inner();
 
diff --git a/src/serde/identity_hash.rs b/src/serde/identity_hash.rs
index 1a1396cb..34856356 100644
--- a/src/serde/identity_hash.rs
+++ b/src/serde/identity_hash.rs
@@ -25,6 +25,7 @@ impl Hasher for IdentityHash {
     }
 }
 
+#[derive(Clone)]
 pub struct RandomState(u64);
 
 impl Default for RandomState {
diff --git a/src/serde/incremental.rs b/src/serde/incremental.rs
index a859d71d..11581cec 100644
--- a/src/serde/incremental.rs
+++ b/src/serde/incremental.rs
@@ -27,19 +27,15 @@ pub struct Serializer {
 
     thc: ObjectCache<Bytes32>,
     slc: ObjectCache<u64>,
+    sentinel: Option<NodePtr>,
    output: Cursor<Vec<u8>>,
 }
 
-impl Default for Serializer {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 #[derive(Clone)]
 pub struct UndoState {
     read_op_stack: Vec<ReadOp>,
     write_stack: Vec<NodePtr>,
+    read_cache_lookup: ReadCacheLookup,
     output_position: u64,
 }
 
@@ -47,13 +43,14 @@ pub struct UndoState {
 /// The compression cannot "see through" the sentinel node, so some compression
 /// opportunities may be missed when serializing and compressing incrementally.
 impl Serializer {
-    pub fn new() -> Self {
+    pub fn new(sentinel: Option<NodePtr>) -> Self {
         Self {
             read_op_stack: vec![ReadOp::Parse],
             write_stack: vec![],
             read_cache_lookup: ReadCacheLookup::new(),
             thc: ObjectCache::new(treehash),
             slc: ObjectCache::new(serialized_length),
+            sentinel,
             output: Cursor::new(vec![]),
         }
     }
@@ -72,12 +69,7 @@ impl Serializer {
     /// beginning if this is the first call. Returns true when we're done
     /// serializing. i.e. no sentinel token was encountered. Once this function
     /// returns true, it may not be called again.
-    pub fn add(
-        &mut self,
-        a: &Allocator,
-        node: NodePtr,
-        sentinel: Option<NodePtr>,
-    ) -> io::Result<(bool, UndoState)> {
+    pub fn add(&mut self, a: &Allocator, node: NodePtr) -> io::Result<(bool, UndoState)> {
         // once we're done serializing (i.e. there was no sentinel in the last
         // call to add()), we can't resume
         assert!(!self.read_op_stack.is_empty());
@@ -85,12 +77,13 @@ impl Serializer {
         let undo_state = UndoState {
             read_op_stack: self.read_op_stack.clone(),
             write_stack: self.write_stack.clone(),
+            read_cache_lookup: self.read_cache_lookup.clone(),
             output_position: self.output.position(),
         };
         self.write_stack.push(node);
 
         while let Some(node_to_write) = self.write_stack.pop() {
-            if Some(node_to_write) == sentinel {
+            if Some(node_to_write) == self.sentinel {
                 // we're not done serializing yet, we're stopping, and the
                 // caller will call add() again with the node to serialize
                 // here
@@ -99,8 +92,9 @@ impl Serializer {
             let op = self.read_op_stack.pop();
             assert!(op == Some(ReadOp::Parse));
 
-            let node_serialized_length = self.slc.get_or_calculate(a, &node_to_write, sentinel);
-            let node_tree_hash = self.thc.get_or_calculate(a, &node_to_write, sentinel);
+            let node_serialized_length =
+                self.slc.get_or_calculate(a, &node_to_write, self.sentinel);
+            let node_tree_hash = self.thc.get_or_calculate(a, &node_to_write, self.sentinel);
             if let (Some(node_tree_hash), Some(node_serialized_length)) =
                 (node_tree_hash, node_serialized_length)
             {
@@ -148,6 +142,7 @@ impl Serializer {
     pub fn restore(&mut self, state: UndoState) {
         self.read_op_stack = state.read_op_stack;
         self.write_stack = state.write_stack;
+        self.read_cache_lookup = state.read_cache_lookup;
         self.output.set_position(state.output_position);
         self.output
             .get_mut()
@@ -168,8 +163,6 @@ impl Serializer {
     /// It's only valid to convert to the inner serialized form once
     /// serialization is complete. i.e. after add() returns true.
     pub fn into_inner(self) -> Vec<u8> {
-        // if the sentinel is set, it means we're in the middle of serialization
-        // still
         assert!(self.read_op_stack.is_empty());
         self.output.into_inner()
     }
@@ -192,18 +185,18 @@ mod tests {
         let item = node_from_bytes(&mut a, &hex!("ffff0102ff0304")).unwrap();
         let list = a.new_pair(item, sentinel).unwrap();
 
-        let mut ser = Serializer::new();
+        let mut ser = Serializer::new(Some(sentinel));
         let mut size = ser.size();
         for _ in 0..10 {
             // this keeps returning false because we encounter a sentinel
-            let (done, _) = ser.add(&a, list, Some(sentinel)).unwrap();
+            let (done, _) = ser.add(&a, list).unwrap();
             assert!(!done);
             assert!(ser.size() > size);
             size = ser.size();
         }
 
         // this returns true because we're done now
-        let (done, _) = ser.add(&a, NodePtr::NIL, None).unwrap();
+        let (done, _) = ser.add(&a, NodePtr::NIL).unwrap();
         assert!(done);
 
         let output = ser.into_inner();
@@ -243,10 +236,10 @@ mod tests {
         let node5 = a.new_pair(node3, node4).unwrap();
         let item = a.new_pair(node2, node5).unwrap();
 
-        let mut ser = Serializer::new();
+        let mut ser = Serializer::new(Some(sentinel));
         let mut size = ser.size();
 
-        let (done, _) = ser.add(&a, item, Some(sentinel)).unwrap();
+        let (done, _) = ser.add(&a, item).unwrap();
         assert!(!done);
         assert!(ser.size() > size);
         size = ser.size();
@@ -261,14 +254,14 @@ mod tests {
 
         for _ in 0..10 {
             // this keeps returning false because we encounter a sentinel
-            let (done, _) = ser.add(&a, item, Some(sentinel)).unwrap();
+            let (done, _) = ser.add(&a, item).unwrap();
             assert!(!done);
             assert!(ser.size() > size);
             size = ser.size();
         }
 
         // this returns true because we're done now
-        let (done, _) = ser.add(&a, NodePtr::NIL, None).unwrap();
+        let (done, _) = ser.add(&a, NodePtr::NIL).unwrap();
         assert!(done);
 
         // The "foobar" atom is serialized as 86666f6f626172
@@ -299,13 +292,13 @@ mod tests {
         let item = node_from_bytes(&mut a, &hex!("ffff0102ff0304")).unwrap();
         let list = a.new_pair(item, sentinel).unwrap();
 
-        let mut ser = Serializer::new();
-        let (done, _) = ser.add(&a, list, Some(sentinel)).unwrap();
+        let mut ser = Serializer::new(Some(sentinel));
+        let (done, _) = ser.add(&a, list).unwrap();
         assert!(!done);
         assert_eq!(ser.size(), 8);
         assert_eq!(hex::encode(ser.get_ref()), "ffffff0102ff0304");
 
-        let (done, state) = ser.add(&a, NodePtr::NIL, None).unwrap();
+        let (done, state) = ser.add(&a, NodePtr::NIL).unwrap();
         assert!(done);
         assert_eq!(ser.size(), 9);
         assert_eq!(hex::encode(ser.get_ref()), "ffffff0102ff030480");
@@ -315,17 +308,17 @@ mod tests {
         assert_eq!(ser.size(), 8);
         assert_eq!(hex::encode(ser.get_ref()), "ffffff0102ff0304");
 
-        let (done, _) = ser.add(&a, item, None).unwrap();
+        let (done, _) = ser.add(&a, item).unwrap();
         assert!(done);
 
         assert_eq!(ser.size(), 10);
-        assert_eq!(hex::encode(ser.get_ref()), "ffffff0102ff0304fe04");
+        assert_eq!(hex::encode(ser.get_ref()), "ffffff0102ff0304fe02");
 
         ser.restore(state);
 
         let item = a.new_small_number(1337).unwrap();
 
-        let (done, _) = ser.add(&a, item, None).unwrap();
+        let (done, _) = ser.add(&a, item).unwrap();
         assert!(done);
 
         assert_eq!(ser.size(), 11);
@@ -334,4 +327,132 @@ mod tests {
         let output = ser.into_inner();
         assert_eq!(hex::encode(&output), "ffffff0102ff0304820539");
     }
+
+    #[test]
+    fn test_incremental_restore() {
+        let mut a = Allocator::new();
+
+        let sentinel = a.new_pair(NodePtr::NIL, NodePtr::NIL).unwrap();
+        // ((0x000000000000 . 0x111111111111) . (0x222222222222 . 0x333333333333))
+        let item = node_from_bytes(
+            &mut a,
+            &hex!("ffff8600000000000086111111111111ff8622222222222286333333333333"),
+        )
+        .unwrap();
+        let item1 = a.new_pair(item, sentinel).unwrap();
+
+        // ((0x111111111111 . 0x000000000000) . (0x222222222222 . 0x333333333333))
+        let item = node_from_bytes(
+            &mut a,
+            &hex!("ffff8611111111111186000000000000ff8622222222222286333333333333"),
+        )
+        .unwrap();
+        let item2 = a.new_pair(item, sentinel).unwrap();
+
+        // ((0x000000000000 . 0x111111111111) . (0x333333333333 . 0x222222222222))
+        let item = node_from_bytes(
+            &mut a,
+            &hex!("ffff8600000000000086111111111111ff8633333333333386222222222222"),
+        )
+        .unwrap();
+        let item3 = a.new_pair(item, sentinel).unwrap();
+
+        // add item1, item2, item3
+        // restore to after item1
+        // add item3, item2
+        // terminate the list
+        let mut ser = Serializer::new(Some(sentinel));
+        let (done, _) = ser.add(&a, item1).unwrap();
+        assert!(!done);
+        println!("{}", hex::encode(ser.get_ref()));
+        let (done, restore_state) = ser.add(&a, item2).unwrap();
+        assert!(!done);
+        println!("{}", hex::encode(ser.get_ref()));
+        let (done, _) = ser.add(&a, item3).unwrap();
+        assert!(!done);
+        println!("{}", hex::encode(ser.get_ref()));
+        println!("restore");
+        ser.restore(restore_state);
+        println!("{}", hex::encode(ser.get_ref()));
+
+        let (done, _) = ser.add(&a, item3).unwrap();
+        assert!(!done);
+        println!("{}", hex::encode(ser.get_ref()));
+        let (done, _) = ser.add(&a, item2).unwrap();
+        assert!(!done);
+        println!("{}", hex::encode(ser.get_ref()));
+
+        let (done, _) = ser.add(&a, NodePtr::NIL).unwrap();
+        assert!(done);
+        println!("{}", hex::encode(ser.get_ref()));
+
+        let output = ser.into_inner();
+
+        {
+            let mut a = Allocator::new();
+            let result = node_from_bytes_backrefs(&mut a, &output).expect("invalid serialization");
+            let roundtrip = node_to_bytes(&a, result).expect("failed to serialize");
+            assert_eq!(
+                hex::encode(roundtrip),
+                "
+                ff
+                ff
+                ff
+                86000000000000
+                86111111111111
+                ff
+                86222222222222
+                86333333333333
+                ff
+                ff
+                ff
+                86000000000000
+                86111111111111
+                ff
+                86333333333333
+                86222222222222
+                ff
+                ff
+                ff
+                86111111111111
+                86000000000000
+                ff
+                86222222222222
+                86333333333333
+                80"
+                .chars()
+                .filter(|c| !c.is_whitespace())
+                .collect::<String>()
+            );
+        }
+
+        assert_eq!(
+            hex::encode(output),
+            "
+            ff
+            ff
+            ff
+            86000000000000
+            86111111111111
+            ff
+            86222222222222
+            86333333333333
+            ff
+            ff
+            fe04
+            ff
+            fe1d
+            fe2b
+            ff
+            ff
+            ff
+            fe0c
+            fe11
+            fe1b
+            80"
+            .chars()
+            .filter(|c| !c.is_whitespace())
+            .collect::<String>()
+        );
+    }
 }
diff --git a/src/serde/read_cache_lookup.rs b/src/serde/read_cache_lookup.rs
index 87df599c..92adeb0a 100644
--- a/src/serde/read_cache_lookup.rs
+++ b/src/serde/read_cache_lookup.rs
@@ -22,7 +22,7 @@
 use std::collections::{HashMap, HashSet};
 
 use super::bytes32::{hash_blob, hash_blobs, Bytes32};
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct ReadCacheLookup {
     root_hash: Bytes32,
diff --git a/src/serde/test.rs b/src/serde/test.rs
index 767be26c..49099d62 100644
--- a/src/serde/test.rs
+++ b/src/serde/test.rs
@@ -40,8 +40,8 @@ fn check_round_trip(obj_ser_br_hex: &str) {
     let mut allocator = Allocator::new();
    let obj = node_from_bytes(&mut allocator, &obj_ser_no_br_1).unwrap();
 
-    let mut serializer = Serializer::new();
-    let (done, _) = serializer.add(&allocator, obj, None).unwrap();
+    let mut serializer = Serializer::new(None);
+    let (done, _) = serializer.add(&allocator, obj).unwrap();
     assert!(done);
     let obj_ser_br_2 = serializer.into_inner();
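
Editor's note (not part of the patch): the API change above moves the sentinel from each add() call into Serializer::new(), and restore() now rewinds the back-reference cache (ReadCacheLookup) along with the output buffer and parse stacks. Below is a minimal sketch of the new calling convention, modeled on the fuzz target in this patch; the import paths and the wrapper function are assumptions for illustration, not code from the repository.

    // Sketch only: driving the reworked incremental Serializer, assuming the
    // crate layout of clvmr; adjust imports to the actual re-exports.
    use clvmr::allocator::{Allocator, NodePtr};
    use clvmr::serde::Serializer;

    fn serialize_in_two_steps(
        a: &Allocator,
        first_step: NodePtr,  // tree containing `sentinel` where the rest will go
        second_step: NodePtr, // subtree to serialize in place of the sentinel
        sentinel: NodePtr,
    ) -> std::io::Result<Vec<u8>> {
        // the sentinel is now passed once, to new(), instead of to every add() call
        let mut ser = Serializer::new(Some(sentinel));

        // done == false: we stopped at the sentinel; the returned UndoState could
        // later be handed to restore() to rewind the output, the parse stacks and
        // (with this patch) the back-reference cache
        let (done, _undo_state) = ser.add(a, first_step)?;
        assert!(!done);

        // done == true: no sentinel was encountered, serialization is complete
        let (done, _) = ser.add(a, second_step)?;
        assert!(done);

        // only valid once add() has returned true
        Ok(ser.into_inner())
    }

Cloning the ReadCacheLookup into UndoState is what makes restore() safe to resume from: back-references computed after the snapshot can no longer leak into a serialization that is rewound and continued with different nodes, which is the behavior exercised by test_incremental_restore above.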