From 6e8bf3f8369edd702bb116f723dc19847d5e19ef Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 12 May 2021 11:01:10 +0200 Subject: [PATCH] add RawDocument abstraction to access bytes in doc store --- src/indexer/merger.rs | 8 ++++---- src/store/reader.rs | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 5f792b660e..de07f0355a 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -918,16 +918,16 @@ impl IndexMerger { if let Some(doc_id_mapping) = doc_id_mapping { for (old_doc_id, reader_with_ordinal) in doc_id_mapping { let store_reader = reader_with_ordinal.reader.get_store_reader()?; - let (block, start_pos, end_pos) = store_reader.get_raw(*old_doc_id)?; - store_writer.store_bytes(&block[start_pos..end_pos])?; + let raw_doc = store_reader.get_raw(*old_doc_id)?; + store_writer.store_bytes(raw_doc.get_bytes())?; } } else { for reader in &self.readers { let store_reader = reader.get_store_reader()?; if reader.num_deleted_docs() > 0 { for doc_id in reader.doc_ids_alive() { - let (block, start_pos, end_pos) = store_reader.get_raw(doc_id)?; - store_writer.store_bytes(&block[start_pos..end_pos])?; + let raw_doc = store_reader.get_raw(doc_id)?; + store_writer.store_bytes(raw_doc.get_bytes())?; } } else { store_writer.stack(&store_reader)?; diff --git a/src/store/reader.rs b/src/store/reader.rs index 134bf10198..b6e9acf3c7 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -86,23 +86,25 @@ impl StoreReader { /// Reads a given document. /// /// Calling `.get(doc)` is relatively costly as it requires - /// decompressing a compressed block. + /// decompressing a compressed block. The store utilizes a LRU cache, + /// so accessing docs from the same compressed block should be faster. /// /// It should not be called to score documents /// for instance. 
pub fn get(&self, doc_id: DocId) -> crate::Result<Document> { - let (block, start_pos, end_pos) = self.get_raw(doc_id)?; - let mut cursor = &block[start_pos..end_pos]; + let raw_doc = self.get_raw(doc_id)?; + let mut cursor = raw_doc.get_bytes(); Ok(Document::deserialize(&mut cursor)?) } - /// Reads raw bytes of a given document. Returns the block of a document and its start and end + /// Reads raw bytes of a given document. Returns `RawDocument`, which contains the block of a document and its start and end /// position within the block. /// /// Calling `.get(doc)` is relatively costly as it requires - /// decompressing a compressed block. + /// decompressing a compressed block. The store utilizes a LRU cache, + /// so accessing docs from the same compressed block should be faster. /// - pub fn get_raw(&self, doc_id: DocId) -> crate::Result<(Arc<Vec<u8>>, usize, usize)> { + pub fn get_raw(&self, doc_id: DocId) -> crate::Result<RawDocument> { let checkpoint = self.block_checkpoint(doc_id).ok_or_else(|| { crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id)) })?; @@ -117,7 +119,11 @@ impl StoreReader { let doc_length = VInt::deserialize(&mut cursor)?.val() as usize; let start_pos = cursor_len_before - cursor.len(); let end_pos = cursor_len_before - cursor.len() + doc_length; - Ok((block, start_pos, end_pos)) + Ok(RawDocument { + block, + start_pos, + end_pos, + }) } /// Summarize total space usage of this store reader. @@ -126,6 +132,23 @@ impl StoreReader { } } +/// Get the bytes of a serialized `Document` in a decompressed block. +pub struct RawDocument { + /// the block of data containing multiple documents + block: Arc<Vec<u8>>, + /// start position of the document in the block + start_pos: usize, + /// end position of the document in the block + end_pos: usize, +} + +impl RawDocument { + /// Get the bytes of a serialized `Document` in a decompressed block. 
+ pub fn get_bytes(&self) -> &[u8] { + &self.block[self.start_pos..self.end_pos] + } +} + fn split_file(data: FileSlice) -> io::Result<(FileSlice, FileSlice)> { let (data, footer_len_bytes) = data.split_from_end(size_of::<u64>()); let serialized_offset: OwnedBytes = footer_len_bytes.read_bytes()?;