From 9c795a4b907af900cedd299260b0f6672f17fec5 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Fri, 25 Feb 2022 21:14:09 -0500 Subject: [PATCH 1/5] Implement RandomAccessBinaryCollection RandomAccessBinaryCollection is a wrapper over BinaryCollection that collects sequence offsets at construction time to allow for random access. --- src/binary_collection.rs | 264 ++++++++++++++++++++++++++++++++++++--- src/lib.rs | 7 +- src/pisa2ciff.rs | 2 +- 3 files changed, 254 insertions(+), 19 deletions(-) diff --git a/src/binary_collection.rs b/src/binary_collection.rs index 5a94518..2eaab8f 100644 --- a/src/binary_collection.rs +++ b/src/binary_collection.rs @@ -3,8 +3,10 @@ use std::convert::TryInto; use std::error::Error; use std::fmt; +const ELEMENT_SIZE: usize = std::mem::size_of::(); + /// Error raised when the bytes cannot be properly parsed into the collection format. -#[derive(Debug)] +#[derive(Debug, Default)] pub struct InvalidFormat(Option); impl InvalidFormat { @@ -14,12 +16,6 @@ impl InvalidFormat { } } -impl Default for InvalidFormat { - fn default() -> Self { - Self(None) - } -} - impl Error for InvalidFormat {} impl fmt::Display for InvalidFormat { @@ -72,6 +68,7 @@ impl fmt::Display for InvalidFormat { /// # Ok(()) /// # } /// ``` +#[derive(Debug, Clone, Copy)] pub struct BinaryCollection<'a> { bytes: &'a [u8], } @@ -89,23 +86,25 @@ impl<'a> TryFrom<&'a [u8]> for BinaryCollection<'a> { } } -fn get_next<'a>( - collection: &mut BinaryCollection<'a>, -) -> Result, InvalidFormat> { - const ELEMENT_SIZE: usize = std::mem::size_of::(); - let length_bytes = collection - .bytes +fn get_from(bytes: &[u8]) -> Result, InvalidFormat> { + let length_bytes = bytes .get(..ELEMENT_SIZE) .ok_or_else(InvalidFormat::default)?; let length = u32::from_le_bytes(length_bytes.try_into().unwrap()) as usize; - let bytes = collection - .bytes + let bytes = bytes .get(ELEMENT_SIZE..(ELEMENT_SIZE * (length + 1))) .ok_or_else(InvalidFormat::default)?; - collection.bytes = &collection.bytes[length_bytes.len() + bytes.len()..]; Ok(BinarySequence { bytes, length }) } +fn get_next<'a>( + collection: &mut BinaryCollection<'a>, +) -> Result, InvalidFormat> { + let sequence = get_from(collection.bytes)?; + collection.bytes = &collection.bytes[ELEMENT_SIZE * (sequence.len() + 1)..]; + Ok(sequence) +} + impl<'a> Iterator for BinaryCollection<'a> { type Item = Result, InvalidFormat>; @@ -118,6 +117,142 @@ impl<'a> Iterator for BinaryCollection<'a> { } } +/// A version of [`BinaryCollection`] with random access to sequences. +/// +/// Because the binary format underlying [`BinaryCollection`] does not +/// support random access, implementing it requires precomputing memory +/// offsets for the sequences, and storing them in the struct. +/// This means [`RandomAccessBinaryCollection::try_from`] will have to +/// perform one full pass through the entire collection to collect the +/// offsets. Thus, use this class only if you need the random access +/// funcionality. +/// +/// Note that the because offsets are stored within the struct, it is +/// not `Copy` as opposed to [`BinaryCollection`], which is simply a view +/// over a memory buffer. +/// +/// # Examples +/// +/// ``` +/// # use ciff::{encode_u32_sequence, RandomAccessBinaryCollection, InvalidFormat}; +/// # use std::convert::TryFrom; +/// # fn main() -> Result<(), anyhow::Error> { +/// let mut buffer: Vec = Vec::new(); +/// encode_u32_sequence(&mut buffer, 3, &[1, 2, 3])?; +/// encode_u32_sequence(&mut buffer, 1, &[4])?; +/// encode_u32_sequence(&mut buffer, 3, &[5, 6, 7])?; +/// +/// let mut collection = RandomAccessBinaryCollection::try_from(&buffer[..])?; +/// assert_eq!( +/// collection.get(0).map(|seq| seq.iter().collect::>()), +/// Some(vec![1_u32, 2, 3]), +/// ); +/// assert_eq!( +/// collection.at(2).iter().collect::>(), +/// vec![5_u32, 6, 7], +/// ); +/// assert_eq!(collection.get(3), None); +/// # Ok(()) +/// # } +/// ``` +/// +/// ```should_panic +/// # use ciff::{encode_u32_sequence, RandomAccessBinaryCollection, InvalidFormat}; +/// # use std::convert::TryFrom; +/// # fn main() -> Result<(), anyhow::Error> { +/// # let mut buffer: Vec = Vec::new(); +/// # encode_u32_sequence(&mut buffer, 3, &[1, 2, 3])?; +/// # encode_u32_sequence(&mut buffer, 1, &[4])?; +/// # encode_u32_sequence(&mut buffer, 3, &[5, 6, 7])?; +/// # let mut collection = RandomAccessBinaryCollection::try_from(&buffer[..])?; +/// collection.at(3); // out of bounds +/// # Ok(()) +/// # } +/// ``` +#[derive(Debug, Clone)] +pub struct RandomAccessBinaryCollection<'a> { + inner: BinaryCollection<'a>, + offsets: Vec, +} + +impl<'a> TryFrom<&'a [u8]> for RandomAccessBinaryCollection<'a> { + type Error = InvalidFormat; + fn try_from(bytes: &'a [u8]) -> Result { + let collection = BinaryCollection::try_from(bytes)?; + let offsets = collection + .map(|sequence| sequence.map(|s| s.len())) + .scan(0, |offset, len| { + Some(len.map(|len| { + let result = *offset; + *offset += ELEMENT_SIZE * (len + 1); + result + })) + }) + .collect::, _>>()?; + + Ok(Self { + inner: collection, + offsets, + }) + } +} + +impl<'a> IntoIterator for RandomAccessBinaryCollection<'a> { + type Item = Result, InvalidFormat>; + type IntoIter = BinaryCollection<'a>; + fn into_iter(self) -> BinaryCollection<'a> { + self.inner + } +} + +impl<'a> RandomAccessBinaryCollection<'a> { + /// Returns an iterator over sequences. + pub fn iter(&self) -> impl Iterator, InvalidFormat>> { + self.inner + } + + /// Returns the sequence at the given index. + /// + /// # Panics + /// + /// Panics if the index is out of bounds. + #[must_use] + pub fn at(&self, index: usize) -> BinarySequence<'a> { + if let Some(sequence) = self.get(index) { + sequence + } else { + panic!("out of bounds"); + } + } + + /// Returns the sequence at the given index or `None` if out of bounds. + #[must_use] + pub fn get(&self, index: usize) -> Option> { + let byte_offset = *self.offsets.get(index)?; + if let Ok(sequence) = get_from(self.inner.bytes.get(byte_offset..)?) { + Some(sequence) + } else { + // The following case should be unreachable, because when constructing + // the collection, we iterate through all sequences. Though there still + // can be an error when iterating the sequence elements, the sequence + // itself must be Ok. + unreachable!() + } + } + + /// Returns the number of sequences in the collection. + #[must_use] + pub fn len(&self) -> usize { + self.offsets.len() + } + + /// Checks if the collection is empty. + #[must_use] + pub fn is_empty(&self) -> bool { + self.offsets.len() == 0 + } +} + /// A single binary sequence. /// /// # Examples @@ -138,6 +273,7 @@ impl<'a> Iterator for BinaryCollection<'a> { /// # Ok(()) /// # } /// ``` +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct BinarySequence<'a> { /// All bytes, **excluding** the length bytes. bytes: &'a [u8], @@ -272,4 +408,100 @@ mod test { let _ = sequence.get(idx); } } + + #[test] + fn test_binary_collection() { + let input: Vec = vec![ + 1, 0, 0, 0, 3, 0, 0, 0, // Number of documents + 1, 0, 0, 0, 0, 0, 0, 0, // t0 + 1, 0, 0, 0, 0, 0, 0, 0, // t1 + 1, 0, 0, 0, 0, 0, 0, 0, // t2 + 1, 0, 0, 0, 0, 0, 0, 0, // t3 + 1, 0, 0, 0, 2, 0, 0, 0, // t4 + 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5 + 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6 + 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7 + 1, 0, 0, 0, 1, 0, 0, 0, // t8 + ]; + let coll = BinaryCollection::try_from(input.as_ref()).unwrap(); + let sequences = coll + .map(|sequence| { + sequence.map(|sequence| (sequence.len(), sequence.iter().collect::>())) + }) + .collect::, _>>() + .unwrap(); + assert_eq!( + sequences, + vec![ + (1, vec![3]), + (1, vec![0]), + (1, vec![0]), + (1, vec![0]), + (1, vec![0]), + (1, vec![2]), + (3, vec![0, 1, 2]), + (2, vec![1, 2]), + (3, vec![0, 1, 2]), + (1, vec![1]), + ] + ); + } + + #[test] + fn test_random_access_binary_collection() { + let input = vec![ + 1, 0, 0, 0, 3, 0, 0, 0, // Number of documents + 1, 0, 0, 0, 0, 0, 0, 0, // t0 + 1, 0, 0, 0, 0, 0, 0, 0, // t1 + 1, 0, 0, 0, 0, 0, 0, 0, // t2 + 1, 0, 0, 0, 0, 0, 0, 0, // t3 + 1, 0, 0, 0, 2, 0, 0, 0, // t4 + 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5 + 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6 + 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7 + 1, 0, 0, 0, 1, 0, 0, 0, // t8 + ]; + let coll = RandomAccessBinaryCollection::try_from(input.as_ref()).unwrap(); + let sequences = coll + .iter() + .map(|sequence| { + sequence.map(|sequence| (sequence.len(), sequence.iter().collect::>())) + }) + .collect::, _>>() + .unwrap(); + assert_eq!( + sequences, + vec![ + (1, vec![3]), + (1, vec![0]), + (1, vec![0]), + (1, vec![0]), + (1, vec![0]), + (1, vec![2]), + (3, vec![0, 1, 2]), + (2, vec![1, 2]), + (3, vec![0, 1, 2]), + (1, vec![1]), + ] + ); + assert_eq!(coll.offsets, vec![0, 8, 16, 24, 32, 40, 48, 64, 76, 92]); + assert_eq!(coll.len(), 10); + assert_eq!( + (0..coll.len()) + .map(|idx| coll.at(idx).iter().collect()) + .collect::>>(), + vec![ + vec![3], + vec![0], + vec![0], + vec![0], + vec![0], + vec![2], + vec![0, 1, 2], + vec![1, 2], + vec![0, 1, 2], + vec![1], + ] + ); + } } diff --git a/src/lib.rs b/src/lib.rs index e315783..f4d67c5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,7 +18,8 @@ clippy::module_name_repetitions, clippy::default_trait_access, clippy::cast_possible_wrap, - clippy::cast_possible_truncation + clippy::cast_possible_truncation, + clippy::copy_iterator )] use anyhow::{anyhow, Context}; @@ -37,7 +38,9 @@ use std::path::{Path, PathBuf}; mod proto; pub use proto::{DocRecord, Posting, PostingsList}; mod binary_collection; -pub use binary_collection::{BinaryCollection, BinarySequence, InvalidFormat}; +pub use binary_collection::{ + BinaryCollection, BinarySequence, InvalidFormat, RandomAccessBinaryCollection, +}; type Result = anyhow::Result; diff --git a/src/pisa2ciff.rs b/src/pisa2ciff.rs index b6754d3..a451346 100644 --- a/src/pisa2ciff.rs +++ b/src/pisa2ciff.rs @@ -41,7 +41,7 @@ fn main() { &args.terms, &args.documents, &args.output, - &args.description.unwrap_or_else(String::new), + &args.description.unwrap_or_default(), ) { eprintln!("ERROR: {}", error); std::process::exit(1); From 631cb2f996090319cef6d6a6fad287fe233c9baf Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Fri, 25 Feb 2022 21:46:51 -0500 Subject: [PATCH 2/5] Fix downloading grcov --- .github/workflows/rust.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index fe9ecdf..df2f40f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -78,7 +78,10 @@ jobs: rust-version: nightly - run: rustup component add llvm-tools-preview - run: echo "PATH=/home/runner/.cargo/bin:$PATH" >> $GITHUB_ENV - - run: curl -L https://github.com/mozilla/grcov/releases/latest/download/grcov-linux-x86_64.tar.bz2 | tar jxf - + - run: echo "7817b621f62dddfadd35fb84999b441bbce72b70cd8a61a9fe8e0998ccf75898 *grcov-v0.8.6-x86_64-unknown-linux-gnu.tar.gz" > checksums + - run: curl -sOL https://github.com/mozilla/grcov/releases/download/v0.8.6/grcov-v0.8.6-x86_64-unknown-linux-gnu.tar.gz + - run: sha256sum -c checksums --ignore-missing + - run: tar xf grcov-v0.8.6-x86_64-unknown-linux-gnu.tar.gz - run: cargo test --verbose --workspace - run: mkdir ./coverage - run: ./grcov . --binary-path ./target/debug/ -s . -t lcov --branch --ignore-not-existing --ignore "*cargo*" --ignore "build.rs" --ignore "*target*" --ignore "tests/*" --ignore "*ciff2pisa.rs" --ignore "*pisa2ciff.rs" -o ./coverage/lcov.info From 5f312725877e39871e92a6bf2498db59bf22b6dc Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Sat, 26 Feb 2022 08:10:27 -0500 Subject: [PATCH 3/5] Add tests --- src/binary_collection.rs | 83 +++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/src/binary_collection.rs b/src/binary_collection.rs index 2eaab8f..40de3da 100644 --- a/src/binary_collection.rs +++ b/src/binary_collection.rs @@ -6,7 +6,7 @@ use std::fmt; const ELEMENT_SIZE: usize = std::mem::size_of::(); /// Error raised when the bytes cannot be properly parsed into the collection format. -#[derive(Debug, Default)] +#[derive(Debug, Default, PartialEq, Eq)] pub struct InvalidFormat(Option); impl InvalidFormat { @@ -197,14 +197,6 @@ impl<'a> TryFrom<&'a [u8]> for RandomAccessBinaryCollection<'a> { } } -impl<'a> IntoIterator for RandomAccessBinaryCollection<'a> { - type Item = Result, InvalidFormat>; - type IntoIter = BinaryCollection<'a>; - fn into_iter(self) -> BinaryCollection<'a> { - self.inner - } -} - impl<'a> RandomAccessBinaryCollection<'a> { /// Returns an iterator over sequences. pub fn iter(&self) -> impl Iterator, InvalidFormat>> { @@ -221,7 +213,11 @@ impl<'a> RandomAccessBinaryCollection<'a> { if let Some(sequence) = self.get(index) { sequence } else { - panic!("out of bounds"); + panic!( + "out of bounds: requested {} out of {} elements", + index, + self.len() + ); } } @@ -385,13 +381,24 @@ mod test { use super::*; use quickcheck_macros::quickcheck; + const COLLECTION_BYTES: [u8; 100] = [ + 1, 0, 0, 0, 3, 0, 0, 0, // Number of documents + 1, 0, 0, 0, 0, 0, 0, 0, // t0 + 1, 0, 0, 0, 0, 0, 0, 0, // t1 + 1, 0, 0, 0, 0, 0, 0, 0, // t2 + 1, 0, 0, 0, 0, 0, 0, 0, // t3 + 1, 0, 0, 0, 2, 0, 0, 0, // t4 + 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5 + 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6 + 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7 + 1, 0, 0, 0, 1, 0, 0, 0, // t8 + ]; + #[test] fn test_binary_sequence() { let bytes: Vec = (0_u32..10).flat_map(|i| i.to_le_bytes().to_vec()).collect(); - let sequence = BinarySequence { - bytes: &bytes, - length: 10, - }; + let sequence = BinarySequence::try_from(bytes.as_ref()).unwrap(); + assert!(!sequence.is_empty()); for n in 0..10 { assert_eq!(sequence.get(n).unwrap(), n as u32); } @@ -411,19 +418,7 @@ mod test { #[test] fn test_binary_collection() { - let input: Vec = vec![ - 1, 0, 0, 0, 3, 0, 0, 0, // Number of documents - 1, 0, 0, 0, 0, 0, 0, 0, // t0 - 1, 0, 0, 0, 0, 0, 0, 0, // t1 - 1, 0, 0, 0, 0, 0, 0, 0, // t2 - 1, 0, 0, 0, 0, 0, 0, 0, // t3 - 1, 0, 0, 0, 2, 0, 0, 0, // t4 - 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5 - 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6 - 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7 - 1, 0, 0, 0, 1, 0, 0, 0, // t8 - ]; - let coll = BinaryCollection::try_from(input.as_ref()).unwrap(); + let coll = BinaryCollection::try_from(COLLECTION_BYTES.as_ref()).unwrap(); let sequences = coll .map(|sequence| { sequence.map(|sequence| (sequence.len(), sequence.iter().collect::>())) @@ -447,21 +442,22 @@ mod test { ); } + #[test] + fn test_binary_collection_invalid_format() { + let input: Vec = vec![1, 0, 0, 0, 3, 0, 0, 0, 1]; + let coll = BinaryCollection::try_from(input.as_ref()); + assert_eq!( + coll.err(), + Some(InvalidFormat::new( + "The byte-length of the collection is not divisible by the element size (4)" + )) + ); + } + #[test] fn test_random_access_binary_collection() { - let input = vec![ - 1, 0, 0, 0, 3, 0, 0, 0, // Number of documents - 1, 0, 0, 0, 0, 0, 0, 0, // t0 - 1, 0, 0, 0, 0, 0, 0, 0, // t1 - 1, 0, 0, 0, 0, 0, 0, 0, // t2 - 1, 0, 0, 0, 0, 0, 0, 0, // t3 - 1, 0, 0, 0, 2, 0, 0, 0, // t4 - 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5 - 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6 - 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7 - 1, 0, 0, 0, 1, 0, 0, 0, // t8 - ]; - let coll = RandomAccessBinaryCollection::try_from(input.as_ref()).unwrap(); + let coll = RandomAccessBinaryCollection::try_from(COLLECTION_BYTES.as_ref()).unwrap(); + assert!(!coll.is_empty()); let sequences = coll .iter() .map(|sequence| { @@ -504,4 +500,11 @@ mod test { ] ); } + + #[test] + #[should_panic] + fn test_random_access_binary_collection_out_of_bounds() { + let coll = RandomAccessBinaryCollection::try_from(COLLECTION_BYTES.as_ref()).unwrap(); + let _ = coll.at(10); + } } From dea60430e1e8216ca14ca1b9c87b365d174488f8 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Sat, 26 Feb 2022 14:50:18 -0500 Subject: [PATCH 4/5] Implement index reordering --- Cargo.toml | 2 +- src/binary_collection.rs | 51 ++++++++++++++++++++++ src/lib.rs | 92 +++++++++++++++++++++++++++++++++++++--- tests/toy.rs | 76 +++++++++++++++++++++++++++++++++ 4 files changed, 215 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 59ba76a..89dc60a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,11 +26,11 @@ num-traits = "0" indicatif = "0.15" anyhow = "1.0" memmap = "0.7" +tempfile = "3" [build-dependencies] protobuf-codegen-pure = "2.22" [dev-dependencies] -tempfile = "3" quickcheck = "1" quickcheck_macros = "1" diff --git a/src/binary_collection.rs b/src/binary_collection.rs index 40de3da..754569f 100644 --- a/src/binary_collection.rs +++ b/src/binary_collection.rs @@ -2,6 +2,7 @@ use std::convert::TryFrom; use std::convert::TryInto; use std::error::Error; use std::fmt; +use std::io::{self, Write}; const ELEMENT_SIZE: usize = std::mem::size_of::(); @@ -376,6 +377,25 @@ impl<'a> Iterator for BinarySequenceIterator<'a> { } } +/// Reorders a collection according to the given order. +/// +/// The new collection will be written to `output`, such that a sequence at position `i` +/// in `collection` will be at position `order[i]` in the new collection. +pub fn reorder( + collection: &RandomAccessBinaryCollection<'_>, + order: &[usize], + output: &mut W, +) -> io::Result<()> { + for &pos in order { + let sequence = collection.at(pos); + let length = sequence.len() as u32; + output.write_all(&length.to_le_bytes())?; + output.write_all(sequence.bytes)?; + } + output.flush()?; + Ok(()) +} + #[cfg(test)] mod test { use super::*; @@ -507,4 +527,35 @@ mod test { let coll = RandomAccessBinaryCollection::try_from(COLLECTION_BYTES.as_ref()).unwrap(); let _ = coll.at(10); } + + #[test] + fn test_reorder_collection() { + let coll = RandomAccessBinaryCollection::try_from(COLLECTION_BYTES.as_ref()).unwrap(); + let order = vec![0, 1, 4, 9, 5, 6, 7, 2, 3, 8]; + let mut output = Vec::::new(); + reorder(&coll, &order, &mut output).unwrap(); + println!("{:?}", output); + let reordered = BinaryCollection::try_from(output.as_ref()).unwrap(); + let sequences = reordered + .map(|sequence| { + sequence.map(|sequence| (sequence.len(), sequence.iter().collect::>())) + }) + .collect::, _>>() + .unwrap(); + assert_eq!( + sequences, + vec![ + (1, vec![3]), // 0 + (1, vec![0]), // 1 + (1, vec![0]), // 4 + (1, vec![1]), // 9 + (1, vec![2]), // 5 + (3, vec![0, 1, 2]), // 6 + (2, vec![1, 2]), // 7 + (1, vec![0]), // 2 + (1, vec![0]), // 3 + (3, vec![0, 1, 2]), // 8 + ] + ); + } } diff --git a/src/lib.rs b/src/lib.rs index f4d67c5..4ee8820 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,10 +30,12 @@ use num_traits::ToPrimitive; use protobuf::{CodedInputStream, CodedOutputStream}; use std::borrow::Borrow; use std::convert::TryFrom; +use std::ffi::OsStr; use std::fmt; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Write}; use std::path::{Path, PathBuf}; +use tempfile::TempDir; mod proto; pub use proto::{DocRecord, Posting, PostingsList}; @@ -169,6 +171,79 @@ where Ok(()) } +fn check_lines_sorted(reader: R) -> io::Result { + let mut prev = String::from(""); + for line in reader.lines() { + let line = line?; + if line < prev { + return Ok(false); + } + prev = line; + } + Ok(true) +} + +struct PisaIndexPaths { + terms: PathBuf, + documents: PathBuf, + frequencies: PathBuf, + sizes: PathBuf, + titles: PathBuf, +} + +impl PisaIndexPaths { + fn from_base_path(path: &Path) -> Option { + let file_name = path.file_name()?; + let parent = path.parent()?; + let format_name = |file: &OsStr, suffix| { + let mut full_name = file.to_owned(); + full_name.push(suffix); + full_name + }; + Some(Self { + terms: parent.join(format_name(file_name, ".terms")), + documents: parent.join(format_name(file_name, ".docs")), + frequencies: parent.join(format_name(file_name, ".freqs")), + sizes: parent.join(format_name(file_name, ".sizes")), + titles: parent.join(format_name(file_name, ".documents")), + }) + } +} + +fn reorder_postings(path: &Path, order: &[usize], skip_first: bool) -> Result<()> { + let temp = TempDir::new()?; + let tmp_path = temp.path().join("coll"); + std::fs::rename(path, &tmp_path)?; + let mmap = unsafe { Mmap::map(&File::open(tmp_path)?)? }; + let coll = RandomAccessBinaryCollection::try_from(mmap.as_ref())?; + let mut writer = BufWriter::new(File::create(path)?); + if skip_first { + let order: Vec<_> = std::iter::once(0) + .chain(order.iter().map(|&i| i + 1)) + .collect(); + binary_collection::reorder(&coll, &order, &mut writer)?; + } else { + binary_collection::reorder(&coll, order, &mut writer)?; + } + writer.flush()?; + Ok(()) +} + +fn reorder_pisa_index(paths: &PisaIndexPaths) -> Result<()> { + let terms = BufReader::new(File::open(&paths.terms)?) + .lines() + .collect::>>()?; + let mut order: Vec<_> = (0..terms.len()).collect(); + order.sort_by_key(|&i| &terms[i]); + reorder_postings(&paths.documents, &order, true)?; + reorder_postings(&paths.frequencies, &order, false)?; + let mut term_writer = BufWriter::new(File::create(&paths.terms)?); + for index in order { + writeln!(&mut term_writer, "{}", terms[index])?; + } + Ok(()) +} + /// Converts a CIFF index stored in `path` to a PISA "binary collection" (uncompressed inverted /// index) with a basename `output`. /// @@ -180,12 +255,15 @@ where /// - data format is valid but any ID, frequency, or a count is negative, /// - document records is out of order. pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> { + let index_paths = + PisaIndexPaths::from_base_path(output).ok_or_else(|| anyhow!("invalid output path"))?; + let mut ciff_reader = File::open(input).with_context(|| format!("Unable to open {}", input.display()))?; let mut input = CodedInputStream::new(&mut ciff_reader); - let mut documents = BufWriter::new(File::create(format!("{}.docs", output.display()))?); - let mut frequencies = BufWriter::new(File::create(format!("{}.freqs", output.display()))?); - let mut terms = BufWriter::new(File::create(format!("{}.terms", output.display()))?); + let mut documents = BufWriter::new(File::create(&index_paths.documents)?); + let mut frequencies = BufWriter::new(File::create(&index_paths.frequencies)?); + let mut terms = BufWriter::new(File::create(&index_paths.terms)?); let header = Header::from_stream(&mut input)?; println!("{}", header); @@ -211,8 +289,8 @@ pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> { terms.flush()?; eprintln!("Processing document lengths"); - let mut sizes = BufWriter::new(File::create(format!("{}.sizes", output.display()))?); - let mut trecids = BufWriter::new(File::create(format!("{}.documents", output.display()))?); + let mut sizes = BufWriter::new(File::create(&index_paths.sizes)?); + let mut trecids = BufWriter::new(File::create(&index_paths.titles)?); let progress = ProgressBar::new(u64::from(header.num_documents)); progress.set_style(pb_style()); @@ -245,6 +323,10 @@ pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> { } progress.finish(); + if !check_lines_sorted(BufReader::new(File::open(&index_paths.terms)?))? { + reorder_pisa_index(&index_paths)?; + } + Ok(()) } diff --git a/tests/toy.rs b/tests/toy.rs index 1f9f60f..e7010d0 100644 --- a/tests/toy.rs +++ b/tests/toy.rs @@ -109,3 +109,79 @@ fn test_to_and_from_ciff() -> anyhow::Result<()> { Ok(()) } + +#[test] +fn test_reorder_terms() -> anyhow::Result<()> { + let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff"); + let temp = TempDir::new().unwrap(); + let pisa_path = temp.path().join("coll"); + ciff_to_pisa(&input_path, &pisa_path)?; + + // Rewrite the terms; later, we will check if the posting lists are in reverse order. + std::fs::write( + temp.path().join("coll.terms"), + vec![ + "veri", "text", "simpl", "head", "enough", "content", "30", "03", "01", + ] + .join("\n"), + )?; + + let ciff_output_path = temp.path().join("ciff"); + pisa_to_ciff( + &pisa_path, + &temp.path().join("coll.terms"), + &temp.path().join("coll.documents"), + &ciff_output_path, + "", + )?; + + // Convert back to PISA to verify list order + let pisa_copy = temp.path().join("copy"); + ciff_to_pisa(&ciff_output_path, &pisa_copy)?; + + assert_eq!( + std::fs::read_to_string(temp.path().join("copy.documents"))?, + "WSJ_1\nTREC_DOC_1\nDOC222\n" + ); + assert_eq!( + std::fs::read(temp.path().join("coll.sizes"))?, + vec![3, 0, 0, 0, 6, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0] + ); + assert_eq!( + std::fs::read_to_string(temp.path().join("copy.terms"))? + .lines() + .collect::>(), + vec!["01", "03", "30", "content", "enough", "head", "simpl", "text", "veri"] + ); + assert_eq!( + std::fs::read(temp.path().join("copy.docs"))?, + vec![ + 1, 0, 0, 0, 3, 0, 0, 0, // Number of documents + 1, 0, 0, 0, 1, 0, 0, 0, // t8 + 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7 + 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6 + 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5 + 1, 0, 0, 0, 2, 0, 0, 0, // t4 + 1, 0, 0, 0, 0, 0, 0, 0, // t3 + 1, 0, 0, 0, 0, 0, 0, 0, // t2 + 1, 0, 0, 0, 0, 0, 0, 0, // t1 + 1, 0, 0, 0, 0, 0, 0, 0, // t0 + ] + ); + assert_eq!( + std::fs::read(temp.path().join("copy.freqs"))?, + vec![ + 1, 0, 0, 0, 1, 0, 0, 0, // t8 + 3, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, // t7 + 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, // t6 + 3, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, // t5 + 1, 0, 0, 0, 1, 0, 0, 0, // t4 + 1, 0, 0, 0, 1, 0, 0, 0, // t3 + 1, 0, 0, 0, 1, 0, 0, 0, // t2 + 1, 0, 0, 0, 1, 0, 0, 0, // t1 + 1, 0, 0, 0, 1, 0, 0, 0, // t0 + ] + ); + + Ok(()) +} From 4e1f136a3cc9bedab95223cc4ccd3966bf477c43 Mon Sep 17 00:00:00 2001 From: J Mackenzie Date: Mon, 28 Feb 2022 11:57:12 +1100 Subject: [PATCH 5/5] Typo --- src/binary_collection.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/binary_collection.rs b/src/binary_collection.rs index 754569f..c71c4da 100644 --- a/src/binary_collection.rs +++ b/src/binary_collection.rs @@ -126,7 +126,7 @@ impl<'a> Iterator for BinaryCollection<'a> { /// This means [`RandomAccessBinaryCollection::try_from`] will have to /// perform one full pass through the entire collection to collect the /// offsets. Thus, use this class only if you need the random access -/// funcionality. +/// functionality. /// /// Note that the because offsets are stored within the struct, it is /// not `Copy` as opposed to [`BinaryCollection`], which is simply a view