Skip to content

Commit

Permalink
add RawDocument abstraction to access bytes in doc store
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz committed May 12, 2021
1 parent aab65f0 commit 6e8bf3f
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 11 deletions.
8 changes: 4 additions & 4 deletions src/indexer/merger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -918,16 +918,16 @@ impl IndexMerger {
if let Some(doc_id_mapping) = doc_id_mapping {
for (old_doc_id, reader_with_ordinal) in doc_id_mapping {
let store_reader = reader_with_ordinal.reader.get_store_reader()?;
let (block, start_pos, end_pos) = store_reader.get_raw(*old_doc_id)?;
store_writer.store_bytes(&block[start_pos..end_pos])?;
let raw_doc = store_reader.get_raw(*old_doc_id)?;
store_writer.store_bytes(raw_doc.get_bytes())?;
}
} else {
for reader in &self.readers {
let store_reader = reader.get_store_reader()?;
if reader.num_deleted_docs() > 0 {
for doc_id in reader.doc_ids_alive() {
let (block, start_pos, end_pos) = store_reader.get_raw(doc_id)?;
store_writer.store_bytes(&block[start_pos..end_pos])?;
let raw_doc = store_reader.get_raw(doc_id)?;
store_writer.store_bytes(raw_doc.get_bytes())?;
}
} else {
store_writer.stack(&store_reader)?;
Expand Down
37 changes: 30 additions & 7 deletions src/store/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,23 +86,25 @@ impl StoreReader {
/// Reads a given document.
///
/// Calling `.get(doc)` is relatively costly as it requires
/// decompressing a compressed block.
/// decompressing a compressed block. The store utilizes an LRU cache,
/// so accessing docs from the same compressed block should be faster.
///
/// It should not be called to score documents
/// for instance.
pub fn get(&self, doc_id: DocId) -> crate::Result<Document> {
let (block, start_pos, end_pos) = self.get_raw(doc_id)?;
let mut cursor = &block[start_pos..end_pos];
let raw_doc = self.get_raw(doc_id)?;
let mut cursor = raw_doc.get_bytes();
Ok(Document::deserialize(&mut cursor)?)
}

/// Reads raw bytes of a given document. Returns the block of a document and its start and end
/// Reads raw bytes of a given document. Returns `RawDocument`, which contains the block of a document and its start and end
/// position within the block.
///
/// Calling `.get_raw(doc)` is relatively costly as it requires
/// decompressing a compressed block.
/// decompressing a compressed block. The store utilizes an LRU cache,
/// so accessing docs from the same compressed block should be faster.
///
pub fn get_raw(&self, doc_id: DocId) -> crate::Result<(Arc<Vec<u8>>, usize, usize)> {
pub fn get_raw(&self, doc_id: DocId) -> crate::Result<RawDocument> {
let checkpoint = self.block_checkpoint(doc_id).ok_or_else(|| {
crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id))
})?;
Expand All @@ -117,7 +119,11 @@ impl StoreReader {
let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
let start_pos = cursor_len_before - cursor.len();
let end_pos = cursor_len_before - cursor.len() + doc_length;
Ok((block, start_pos, end_pos))
Ok(RawDocument {
block,
start_pos,
end_pos,
})
}

/// Summarize total space usage of this store reader.
Expand All @@ -126,6 +132,23 @@ impl StoreReader {
}
}

/// A serialized `Document` located inside a decompressed block.
///
/// The block is shared through an `Arc`, and the document itself is the
/// `start_pos..end_pos` byte range within that block.
#[derive(Clone, Debug)]
pub struct RawDocument {
    /// the block of data containing multiple documents
    block: Arc<Vec<u8>>,
    /// start position of the document in the block
    start_pos: usize,
    /// end position (exclusive) of the document in the block
    end_pos: usize,
}

impl RawDocument {
    /// Borrows the serialized bytes of the `Document`, i.e. the
    /// `start_pos..end_pos` window of the decompressed block.
    pub fn get_bytes(&self) -> &[u8] {
        let doc_range = self.start_pos..self.end_pos;
        &self.block[doc_range]
    }
}

fn split_file(data: FileSlice) -> io::Result<(FileSlice, FileSlice)> {
let (data, footer_len_bytes) = data.split_from_end(size_of::<u64>());
let serialized_offset: OwnedBytes = footer_len_bytes.read_bytes()?;
Expand Down

0 comments on commit 6e8bf3f

Please sign in to comment.