From 9c795a4b907af900cedd299260b0f6672f17fec5 Mon Sep 17 00:00:00 2001
From: Michal Siedlaczek <michal@siedlaczek.me>
Date: Fri, 25 Feb 2022 21:14:09 -0500
Subject: [PATCH 1/5] Implement RandomAccessBinaryCollection

RandomAccessBinaryCollection is a wrapper over BinaryCollection that
collects sequence offsets at construction time to allow for random
access.
---
 src/binary_collection.rs | 264 ++++++++++++++++++++++++++++++++++++---
 src/lib.rs               |   7 +-
 src/pisa2ciff.rs         |   2 +-
 3 files changed, 254 insertions(+), 19 deletions(-)
diff --git a/src/binary_collection.rs b/src/binary_collection.rs
index 5a94518..2eaab8f 100644
--- a/src/binary_collection.rs
+++ b/src/binary_collection.rs
@@ -3,8 +3,10 @@ use std::convert::TryInto;
 use std::error::Error;
 use std::fmt;
 
+const ELEMENT_SIZE: usize = std::mem::size_of::<u32>();
+
 /// Error raised when the bytes cannot be properly parsed into the collection format.
-#[derive(Debug)]
+#[derive(Debug, Default)]
 pub struct InvalidFormat(Option<String>);
 
 impl InvalidFormat {
@@ -14,12 +16,6 @@ impl InvalidFormat {
     }
 }
 
-impl Default for InvalidFormat {
-    fn default() -> Self {
-        Self(None)
-    }
-}
-
 impl Error for InvalidFormat {}
 
 impl fmt::Display for InvalidFormat {
@@ -72,6 +68,7 @@ impl fmt::Display for InvalidFormat {
 /// # Ok(())
 /// # }
 /// ```
+#[derive(Debug, Clone, Copy)]
 pub struct BinaryCollection<'a> {
     bytes: &'a [u8],
 }
@@ -89,23 +86,25 @@ impl<'a> TryFrom<&'a [u8]> for BinaryCollection<'a> {
     }
 }
 
-fn get_next<'a>(
-    collection: &mut BinaryCollection<'a>,
-) -> Result<BinarySequence<'a>, InvalidFormat> {
-    const ELEMENT_SIZE: usize = std::mem::size_of::<u32>();
-    let length_bytes = collection
-        .bytes
+fn get_from(bytes: &[u8]) -> Result<BinarySequence<'_>, InvalidFormat> {
+    let length_bytes = bytes
         .get(..ELEMENT_SIZE)
         .ok_or_else(InvalidFormat::default)?;
     let length = u32::from_le_bytes(length_bytes.try_into().unwrap()) as usize;
-    let bytes = collection
-        .bytes
+    let bytes = bytes
         .get(ELEMENT_SIZE..(ELEMENT_SIZE * (length + 1)))
         .ok_or_else(InvalidFormat::default)?;
-    collection.bytes = &collection.bytes[length_bytes.len() + bytes.len()..];
     Ok(BinarySequence { bytes, length })
 }
 
+fn get_next<'a>(
+    collection: &mut BinaryCollection<'a>,
+) -> Result<BinarySequence<'a>, InvalidFormat> {
+    let sequence = get_from(collection.bytes)?;
+    collection.bytes = &collection.bytes[ELEMENT_SIZE * (sequence.len() + 1)..];
+    Ok(sequence)
+}
+
 impl<'a> Iterator for BinaryCollection<'a> {
     type Item = Result<BinarySequence<'a>, InvalidFormat>;
 
@@ -118,6 +117,142 @@ impl<'a> Iterator for BinaryCollection<'a> {
     }
 }
 
+/// A version of [`BinaryCollection`] with random access to sequences.
+///
+/// Because the binary format underlying [`BinaryCollection`] does not
+/// support random access, implementing it requires precomputing memory
+/// offsets for the sequences, and storing them in the struct.
+/// This means [`RandomAccessBinaryCollection::try_from`] will have to
+/// perform one full pass through the entire collection to collect the
+/// offsets. Thus, use this class only if you need the random access
+/// funcionality.
+///
+/// Note that the because offsets are stored within the struct, it is
+/// not `Copy` as opposed to [`BinaryCollection`], which is simply a view
+/// over a memory buffer.
+///
+/// # Examples
+///
+/// ```
+/// # use ciff::{encode_u32_sequence, RandomAccessBinaryCollection, InvalidFormat};
+/// # use std::convert::TryFrom;
+/// # fn main() -> Result<(), anyhow::Error> {
+/// let mut buffer: Vec<u8> = Vec::new();
+/// encode_u32_sequence(&mut buffer, 3, &[1, 2, 3])?;
+/// encode_u32_sequence(&mut buffer, 1, &[4])?;
+/// encode_u32_sequence(&mut buffer, 3, &[5, 6, 7])?;
+///
+/// let mut collection = RandomAccessBinaryCollection::try_from(&buffer[..])?;
+/// assert_eq!(
+///     collection.get(0).map(|seq| seq.iter().collect::<Vec<_>>()),
+///     Some(vec![1_u32, 2, 3]),
+/// );
+/// assert_eq!(
+///     collection.at(2).iter().collect::<Vec<_>>(),
+///     vec![5_u32, 6, 7],
+/// );
+/// assert_eq!(collection.get(3), None);
+/// # Ok(())
+/// # }
+/// ```
+///
+/// ```should_panic
+/// # use ciff::{encode_u32_sequence, RandomAccessBinaryCollection, InvalidFormat};
+/// # use std::convert::TryFrom;
+/// # fn main() -> Result<(), anyhow::Error> {
+/// # let mut buffer: Vec<u8> = Vec::new();
+/// # encode_u32_sequence(&mut buffer, 3, &[1, 2, 3])?;
+/// # encode_u32_sequence(&mut buffer, 1, &[4])?;
+/// # encode_u32_sequence(&mut buffer, 3, &[5, 6, 7])?;
+/// # let mut collection = RandomAccessBinaryCollection::try_from(&buffer[..])?;
+/// collection.at(3); // out of bounds
+/// # Ok(())
+/// # }
+/// ```
+#[derive(Debug, Clone)]
+pub struct RandomAccessBinaryCollection<'a> {
+    inner: BinaryCollection<'a>,
+    offsets: Vec<usize>,
+}
+
+impl<'a> TryFrom<&'a [u8]> for RandomAccessBinaryCollection<'a> {
+    type Error = InvalidFormat;
+    fn try_from(bytes: &'a [u8]) -> Result<Self, Self::Error> {
+        let collection = BinaryCollection::try_from(bytes)?;
+        let offsets = collection
+            .map(|sequence| sequence.map(|s| s.len()))
+            .scan(0, |offset, len| {
+                Some(len.map(|len| {
+                    let result = *offset;
+                    *offset += ELEMENT_SIZE * (len + 1);
+                    result
+                }))
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        Ok(Self {
+            inner: collection,
+            offsets,
+        })
+    }
+}
+
+impl<'a> IntoIterator for RandomAccessBinaryCollection<'a> {
+    type Item = Result<BinarySequence<'a>, InvalidFormat>;
+    type IntoIter = BinaryCollection<'a>;
+    fn into_iter(self) -> BinaryCollection<'a> {
+        self.inner
+    }
+}
+
+impl<'a> RandomAccessBinaryCollection<'a> {
+    /// Returns an iterator over sequences.
+    pub fn iter(&self) -> impl Iterator<Item = Result<BinarySequence<'a>, InvalidFormat>> {
+        self.inner
+    }
+
+    /// Returns the sequence at the given index.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the index is out of bounds.
+    #[must_use]
+    pub fn at(&self, index: usize) -> BinarySequence<'a> {
+        if let Some(sequence) = self.get(index) {
+            sequence
+        } else {
+            panic!("out of bounds");
+        }
+    }
+
+    /// Returns the sequence at the given index or `None` if out of bounds.
+    #[must_use]
+    pub fn get(&self, index: usize) -> Option<BinarySequence<'a>> {
+        let byte_offset = *self.offsets.get(index)?;
+        if let Ok(sequence) = get_from(self.inner.bytes.get(byte_offset..)?) {
+            Some(sequence)
+        } else {
+            // The following case should be unreachable, because when constructing
+            // the collection, we iterate through all sequences. Though there still
+            // can be an error when iterating the sequence elements, the sequence
+            // itself must be Ok.
+            unreachable!()
+        }
+    }
+
+    /// Returns the number of sequences in the collection.
+    #[must_use]
+    pub fn len(&self) -> usize {
+        self.offsets.len()
+    }
+
+    /// Checks if the collection is empty.
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.offsets.len() == 0
+    }
+}
+
 /// A single binary sequence.
 ///
 /// # Examples
@@ -138,6 +273,7 @@ impl<'a> Iterator for BinaryCollection<'a> {
 /// # Ok(())
 /// # }
 /// ```
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct BinarySequence<'a> {
     /// All bytes, **excluding** the length bytes.
     bytes: &'a [u8],
@@ -272,4 +408,100 @@ mod test {
             let _ = sequence.get(idx);
         }
     }
+
+    #[test]
+    fn test_binary_collection() {
+        let input: Vec<u8> = vec![
+            1, 0, 0, 0, 3, 0, 0, 0, // Number of documents
+            1, 0, 0, 0, 0, 0, 0, 0, // t0
+            1, 0, 0, 0, 0, 0, 0, 0, // t1
+            1, 0, 0, 0, 0, 0, 0, 0, // t2
+            1, 0, 0, 0, 0, 0, 0, 0, // t3
+            1, 0, 0, 0, 2, 0, 0, 0, // t4
+            3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5
+            2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6
+            3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7
+            1, 0, 0, 0, 1, 0, 0, 0, // t8
+        ];
+        let coll = BinaryCollection::try_from(input.as_ref()).unwrap();
+        let sequences = coll
+            .map(|sequence| {
+                sequence.map(|sequence| (sequence.len(), sequence.iter().collect::<Vec<_>>()))
+            })
+            .collect::<Result<Vec<_>, _>>()
+            .unwrap();
+        assert_eq!(
+            sequences,
+            vec![
+                (1, vec![3]),
+                (1, vec![0]),
+                (1, vec![0]),
+                (1, vec![0]),
+                (1, vec![0]),
+                (1, vec![2]),
+                (3, vec![0, 1, 2]),
+                (2, vec![1, 2]),
+                (3, vec![0, 1, 2]),
+                (1, vec![1]),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_random_access_binary_collection() {
+        let input = vec![
+            1, 0, 0, 0, 3, 0, 0, 0, // Number of documents
+            1, 0, 0, 0, 0, 0, 0, 0, // t0
+            1, 0, 0, 0, 0, 0, 0, 0, // t1
+            1, 0, 0, 0, 0, 0, 0, 0, // t2
+            1, 0, 0, 0, 0, 0, 0, 0, // t3
+            1, 0, 0, 0, 2, 0, 0, 0, // t4
+            3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5
+            2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6
+            3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7
+            1, 0, 0, 0, 1, 0, 0, 0, // t8
+        ];
+        let coll = RandomAccessBinaryCollection::try_from(input.as_ref()).unwrap();
+        let sequences = coll
+            .iter()
+            .map(|sequence| {
+                sequence.map(|sequence| (sequence.len(), sequence.iter().collect::<Vec<_>>()))
+            })
+            .collect::<Result<Vec<_>, _>>()
+            .unwrap();
+        assert_eq!(
+            sequences,
+            vec![
+                (1, vec![3]),
+                (1, vec![0]),
+                (1, vec![0]),
+                (1, vec![0]),
+                (1, vec![0]),
+                (1, vec![2]),
+                (3, vec![0, 1, 2]),
+                (2, vec![1, 2]),
+                (3, vec![0, 1, 2]),
+                (1, vec![1]),
+            ]
+        );
+        assert_eq!(coll.offsets, vec![0, 8, 16, 24, 32, 40, 48, 64, 76, 92]);
+        assert_eq!(coll.len(), 10);
+        assert_eq!(
+            (0..coll.len())
+                .map(|idx| coll.at(idx).iter().collect())
+                .collect::<Vec<Vec<u32>>>(),
+            vec![
+                vec![3],
+                vec![0],
+                vec![0],
+                vec![0],
+                vec![0],
+                vec![2],
+                vec![0, 1, 2],
+                vec![1, 2],
+                vec![0, 1, 2],
+                vec![1],
+            ]
+        );
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index e315783..f4d67c5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -18,7 +18,8 @@
     clippy::module_name_repetitions,
     clippy::default_trait_access,
     clippy::cast_possible_wrap,
-    clippy::cast_possible_truncation
+    clippy::cast_possible_truncation,
+    clippy::copy_iterator
 )]
 
 use anyhow::{anyhow, Context};
@@ -37,7 +38,9 @@ use std::path::{Path, PathBuf};
 mod proto;
 pub use proto::{DocRecord, Posting, PostingsList};
 mod binary_collection;
-pub use binary_collection::{BinaryCollection, BinarySequence, InvalidFormat};
+pub use binary_collection::{
+    BinaryCollection, BinarySequence, InvalidFormat, RandomAccessBinaryCollection,
+};
 
 type Result<T> = anyhow::Result<T>;
 
diff --git a/src/pisa2ciff.rs b/src/pisa2ciff.rs
index b6754d3..a451346 100644
--- a/src/pisa2ciff.rs
+++ b/src/pisa2ciff.rs
@@ -41,7 +41,7 @@ fn main() {
         &args.terms,
         &args.documents,
         &args.output,
-        &args.description.unwrap_or_else(String::new),
+        &args.description.unwrap_or_default(),
     ) {
         eprintln!("ERROR: {}", error);
         std::process::exit(1);

From 631cb2f996090319cef6d6a6fad287fe233c9baf Mon Sep 17 00:00:00 2001
From: Michal Siedlaczek <michal@siedlaczek.me>
Date: Fri, 25 Feb 2022 21:46:51 -0500
Subject: [PATCH 2/5] Fix downloading grcov

---
 .github/workflows/rust.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index fe9ecdf..df2f40f 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -78,7 +78,10 @@ jobs:
             rust-version: nightly
       - run: rustup component add llvm-tools-preview
       - run: echo "PATH=/home/runner/.cargo/bin:$PATH" >> $GITHUB_ENV
-      - run: curl -L https://github.com/mozilla/grcov/releases/latest/download/grcov-linux-x86_64.tar.bz2 | tar jxf -
+      - run: echo "7817b621f62dddfadd35fb84999b441bbce72b70cd8a61a9fe8e0998ccf75898 *grcov-v0.8.6-x86_64-unknown-linux-gnu.tar.gz" > checksums
+      - run: curl -sOL https://github.com/mozilla/grcov/releases/download/v0.8.6/grcov-v0.8.6-x86_64-unknown-linux-gnu.tar.gz
+      - run: sha256sum -c checksums --ignore-missing
+      - run: tar xf grcov-v0.8.6-x86_64-unknown-linux-gnu.tar.gz
       - run: cargo test --verbose --workspace
       - run: mkdir ./coverage
       - run: ./grcov . --binary-path ./target/debug/ -s . -t lcov --branch --ignore-not-existing --ignore "*cargo*" --ignore "build.rs" --ignore "*target*" --ignore "tests/*" --ignore "*ciff2pisa.rs" --ignore "*pisa2ciff.rs" -o ./coverage/lcov.info

From 5f312725877e39871e92a6bf2498db59bf22b6dc Mon Sep 17 00:00:00 2001
From: Michal Siedlaczek <michal@siedlaczek.me>
Date: Sat, 26 Feb 2022 08:10:27 -0500
Subject: [PATCH 3/5] Add tests

---
 src/binary_collection.rs | 83 +++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/src/binary_collection.rs b/src/binary_collection.rs
index 2eaab8f..40de3da 100644
--- a/src/binary_collection.rs
+++ b/src/binary_collection.rs
@@ -6,7 +6,7 @@ use std::fmt;
 const ELEMENT_SIZE: usize = std::mem::size_of::<u32>();
 
 /// Error raised when the bytes cannot be properly parsed into the collection format.
-#[derive(Debug, Default)]
+#[derive(Debug, Default, PartialEq, Eq)]
 pub struct InvalidFormat(Option<String>);
 
 impl InvalidFormat {
@@ -197,14 +197,6 @@ impl<'a> TryFrom<&'a [u8]> for RandomAccessBinaryCollection<'a> {
     }
 }
 
-impl<'a> IntoIterator for RandomAccessBinaryCollection<'a> {
-    type Item = Result<BinarySequence<'a>, InvalidFormat>;
-    type IntoIter = BinaryCollection<'a>;
-    fn into_iter(self) -> BinaryCollection<'a> {
-        self.inner
-    }
-}
-
 impl<'a> RandomAccessBinaryCollection<'a> {
     /// Returns an iterator over sequences.
     pub fn iter(&self) -> impl Iterator<Item = Result<BinarySequence<'a>, InvalidFormat>> {
@@ -221,7 +213,11 @@ impl<'a> RandomAccessBinaryCollection<'a> {
         if let Some(sequence) = self.get(index) {
             sequence
         } else {
-            panic!("out of bounds");
+            panic!(
+                "out of bounds: requested {} out of {} elements",
+                index,
+                self.len()
+            );
         }
     }
 
@@ -385,13 +381,24 @@ mod test {
     use super::*;
     use quickcheck_macros::quickcheck;
 
+    const COLLECTION_BYTES: [u8; 100] = [
+        1, 0, 0, 0, 3, 0, 0, 0, // Number of documents
+        1, 0, 0, 0, 0, 0, 0, 0, // t0
+        1, 0, 0, 0, 0, 0, 0, 0, // t1
+        1, 0, 0, 0, 0, 0, 0, 0, // t2
+        1, 0, 0, 0, 0, 0, 0, 0, // t3
+        1, 0, 0, 0, 2, 0, 0, 0, // t4
+        3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5
+        2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6
+        3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7
+        1, 0, 0, 0, 1, 0, 0, 0, // t8
+    ];
+
     #[test]
     fn test_binary_sequence() {
         let bytes: Vec<u8> = (0_u32..10).flat_map(|i| i.to_le_bytes().to_vec()).collect();
-        let sequence = BinarySequence {
-            bytes: &bytes,
-            length: 10,
-        };
+        let sequence = BinarySequence::try_from(bytes.as_ref()).unwrap();
+        assert!(!sequence.is_empty());
         for n in 0..10 {
             assert_eq!(sequence.get(n).unwrap(), n as u32);
         }
@@ -411,19 +418,7 @@ mod test {
 
     #[test]
     fn test_binary_collection() {
-        let input: Vec<u8> = vec![
-            1, 0, 0, 0, 3, 0, 0, 0, // Number of documents
-            1, 0, 0, 0, 0, 0, 0, 0, // t0
-            1, 0, 0, 0, 0, 0, 0, 0, // t1
-            1, 0, 0, 0, 0, 0, 0, 0, // t2
-            1, 0, 0, 0, 0, 0, 0, 0, // t3
-            1, 0, 0, 0, 2, 0, 0, 0, // t4
-            3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5
-            2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6
-            3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7
-            1, 0, 0, 0, 1, 0, 0, 0, // t8
-        ];
-        let coll = BinaryCollection::try_from(input.as_ref()).unwrap();
+        let coll = BinaryCollection::try_from(COLLECTION_BYTES.as_ref()).unwrap();
         let sequences = coll
             .map(|sequence| {
                 sequence.map(|sequence| (sequence.len(), sequence.iter().collect::<Vec<_>>()))
@@ -447,21 +442,22 @@ mod test {
         );
     }
 
+    #[test]
+    fn test_binary_collection_invalid_format() {
+        let input: Vec<u8> = vec![1, 0, 0, 0, 3, 0, 0, 0, 1];
+        let coll = BinaryCollection::try_from(input.as_ref());
+        assert_eq!(
+            coll.err(),
+            Some(InvalidFormat::new(
+                "The byte-length of the collection is not divisible by the element size (4)"
+            ))
+        );
+    }
+
     #[test]
     fn test_random_access_binary_collection() {
-        let input = vec![
-            1, 0, 0, 0, 3, 0, 0, 0, // Number of documents
-            1, 0, 0, 0, 0, 0, 0, 0, // t0
-            1, 0, 0, 0, 0, 0, 0, 0, // t1
-            1, 0, 0, 0, 0, 0, 0, 0, // t2
-            1, 0, 0, 0, 0, 0, 0, 0, // t3
-            1, 0, 0, 0, 2, 0, 0, 0, // t4
-            3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5
-            2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6
-            3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7
-            1, 0, 0, 0, 1, 0, 0, 0, // t8
-        ];
-        let coll = RandomAccessBinaryCollection::try_from(input.as_ref()).unwrap();
+        let coll = RandomAccessBinaryCollection::try_from(COLLECTION_BYTES.as_ref()).unwrap();
+        assert!(!coll.is_empty());
         let sequences = coll
             .iter()
             .map(|sequence| {
@@ -504,4 +500,11 @@ mod test {
             ]
         );
     }
+
+    #[test]
+    #[should_panic]
+    fn test_random_access_binary_collection_out_of_bounds() {
+        let coll = RandomAccessBinaryCollection::try_from(COLLECTION_BYTES.as_ref()).unwrap();
+        let _ = coll.at(10);
+    }
 }

From dea60430e1e8216ca14ca1b9c87b365d174488f8 Mon Sep 17 00:00:00 2001
From: Michal Siedlaczek <michal@siedlaczek.me>
Date: Sat, 26 Feb 2022 14:50:18 -0500
Subject: [PATCH 4/5] Implement index reordering

---
 Cargo.toml               |  2 +-
 src/binary_collection.rs | 51 ++++++++++++++++++++++
 src/lib.rs               | 92 +++++++++++++++++++++++++++++++++++++---
 tests/toy.rs             | 76 +++++++++++++++++++++++++++++++++
 4 files changed, 215 insertions(+), 6 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 59ba76a..89dc60a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,11 +26,11 @@ num-traits = "0"
 indicatif = "0.15"
 anyhow = "1.0"
 memmap = "0.7"
+tempfile = "3"
 
 [build-dependencies]
 protobuf-codegen-pure = "2.22"
 
 [dev-dependencies]
-tempfile = "3"
 quickcheck = "1"
 quickcheck_macros = "1"
diff --git a/src/binary_collection.rs b/src/binary_collection.rs
index 40de3da..754569f 100644
--- a/src/binary_collection.rs
+++ b/src/binary_collection.rs
@@ -2,6 +2,7 @@ use std::convert::TryFrom;
 use std::convert::TryInto;
 use std::error::Error;
 use std::fmt;
+use std::io::{self, Write};
 
 const ELEMENT_SIZE: usize = std::mem::size_of::<u32>();
 
@@ -376,6 +377,25 @@ impl<'a> Iterator for BinarySequenceIterator<'a> {
     }
 }
 
+/// Reorders a collection according to the given order.
+///
+/// The new collection will be written to `output`, such that the sequence at position
+/// `order[i]` in `collection` will be at position `i` in the new collection.
+pub fn reorder<W: Write>(
+    collection: &RandomAccessBinaryCollection<'_>,
+    order: &[usize],
+    output: &mut W,
+) -> io::Result<()> {
+    for &pos in order {
+        let sequence = collection.at(pos);
+        let length = sequence.len() as u32;
+        output.write_all(&length.to_le_bytes())?;
+        output.write_all(sequence.bytes)?;
+    }
+    output.flush()?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -507,4 +527,35 @@ mod test {
         let coll = RandomAccessBinaryCollection::try_from(COLLECTION_BYTES.as_ref()).unwrap();
         let _ = coll.at(10);
     }
+
+    #[test]
+    fn test_reorder_collection() {
+        let coll = RandomAccessBinaryCollection::try_from(COLLECTION_BYTES.as_ref()).unwrap();
+        let order = vec![0, 1, 4, 9, 5, 6, 7, 2, 3, 8];
+        let mut output = Vec::<u8>::new();
+        reorder(&coll, &order, &mut output).unwrap();
+        println!("{:?}", output);
+        let reordered = BinaryCollection::try_from(output.as_ref()).unwrap();
+        let sequences = reordered
+            .map(|sequence| {
+                sequence.map(|sequence| (sequence.len(), sequence.iter().collect::<Vec<_>>()))
+            })
+            .collect::<Result<Vec<_>, _>>()
+            .unwrap();
+        assert_eq!(
+            sequences,
+            vec![
+                (1, vec![3]),       // 0
+                (1, vec![0]),       // 1
+                (1, vec![0]),       // 4
+                (1, vec![1]),       // 9
+                (1, vec![2]),       // 5
+                (3, vec![0, 1, 2]), // 6
+                (2, vec![1, 2]),    // 7
+                (1, vec![0]),       // 2
+                (1, vec![0]),       // 3
+                (3, vec![0, 1, 2]), // 8
+            ]
+        );
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index f4d67c5..4ee8820 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -30,10 +30,12 @@ use num_traits::ToPrimitive;
 use protobuf::{CodedInputStream, CodedOutputStream};
 use std::borrow::Borrow;
 use std::convert::TryFrom;
+use std::ffi::OsStr;
 use std::fmt;
 use std::fs::File;
 use std::io::{self, BufRead, BufReader, BufWriter, Write};
 use std::path::{Path, PathBuf};
+use tempfile::TempDir;
 
 mod proto;
 pub use proto::{DocRecord, Posting, PostingsList};
@@ -169,6 +171,79 @@ where
     Ok(())
 }
 
+fn check_lines_sorted<R: BufRead>(reader: R) -> io::Result<bool> {
+    let mut prev = String::from("");
+    for line in reader.lines() {
+        let line = line?;
+        if line < prev {
+            return Ok(false);
+        }
+        prev = line;
+    }
+    Ok(true)
+}
+
+struct PisaIndexPaths {
+    terms: PathBuf,
+    documents: PathBuf,
+    frequencies: PathBuf,
+    sizes: PathBuf,
+    titles: PathBuf,
+}
+
+impl PisaIndexPaths {
+    fn from_base_path(path: &Path) -> Option<Self> {
+        let file_name = path.file_name()?;
+        let parent = path.parent()?;
+        let format_name = |file: &OsStr, suffix| {
+            let mut full_name = file.to_owned();
+            full_name.push(suffix);
+            full_name
+        };
+        Some(Self {
+            terms: parent.join(format_name(file_name, ".terms")),
+            documents: parent.join(format_name(file_name, ".docs")),
+            frequencies: parent.join(format_name(file_name, ".freqs")),
+            sizes: parent.join(format_name(file_name, ".sizes")),
+            titles: parent.join(format_name(file_name, ".documents")),
+        })
+    }
+}
+
+fn reorder_postings(path: &Path, order: &[usize], skip_first: bool) -> Result<()> {
+    let temp = TempDir::new()?;
+    let tmp_path = temp.path().join("coll");
+    std::fs::rename(path, &tmp_path)?;
+    let mmap = unsafe { Mmap::map(&File::open(tmp_path)?)? };
+    let coll = RandomAccessBinaryCollection::try_from(mmap.as_ref())?;
+    let mut writer = BufWriter::new(File::create(path)?);
+    if skip_first {
+        let order: Vec<_> = std::iter::once(0)
+            .chain(order.iter().map(|&i| i + 1))
+            .collect();
+        binary_collection::reorder(&coll, &order, &mut writer)?;
+    } else {
+        binary_collection::reorder(&coll, order, &mut writer)?;
+    }
+    writer.flush()?;
+    Ok(())
+}
+
+fn reorder_pisa_index(paths: &PisaIndexPaths) -> Result<()> {
+    let terms = BufReader::new(File::open(&paths.terms)?)
+        .lines()
+        .collect::<io::Result<Vec<_>>>()?;
+    let mut order: Vec<_> = (0..terms.len()).collect();
+    order.sort_by_key(|&i| &terms[i]);
+    reorder_postings(&paths.documents, &order, true)?;
+    reorder_postings(&paths.frequencies, &order, false)?;
+    let mut term_writer = BufWriter::new(File::create(&paths.terms)?);
+    for index in order {
+        writeln!(&mut term_writer, "{}", terms[index])?;
+    }
+    Ok(())
+}
+
 /// Converts a CIFF index stored in `path` to a PISA "binary collection" (uncompressed inverted
 /// index) with a basename `output`.
 ///
@@ -180,12 +255,15 @@ where
 /// - data format is valid but any ID, frequency, or a count is negative,
 /// - document records is out of order.
 pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> {
+    let index_paths =
+        PisaIndexPaths::from_base_path(output).ok_or_else(|| anyhow!("invalid output path"))?;
+
     let mut ciff_reader =
         File::open(input).with_context(|| format!("Unable to open {}", input.display()))?;
     let mut input = CodedInputStream::new(&mut ciff_reader);
-    let mut documents = BufWriter::new(File::create(format!("{}.docs", output.display()))?);
-    let mut frequencies = BufWriter::new(File::create(format!("{}.freqs", output.display()))?);
-    let mut terms = BufWriter::new(File::create(format!("{}.terms", output.display()))?);
+    let mut documents = BufWriter::new(File::create(&index_paths.documents)?);
+    let mut frequencies = BufWriter::new(File::create(&index_paths.frequencies)?);
+    let mut terms = BufWriter::new(File::create(&index_paths.terms)?);
 
     let header = Header::from_stream(&mut input)?;
     println!("{}", header);
@@ -211,8 +289,8 @@ pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> {
     terms.flush()?;
 
     eprintln!("Processing document lengths");
-    let mut sizes = BufWriter::new(File::create(format!("{}.sizes", output.display()))?);
-    let mut trecids = BufWriter::new(File::create(format!("{}.documents", output.display()))?);
+    let mut sizes = BufWriter::new(File::create(&index_paths.sizes)?);
+    let mut trecids = BufWriter::new(File::create(&index_paths.titles)?);
 
     let progress = ProgressBar::new(u64::from(header.num_documents));
     progress.set_style(pb_style());
@@ -245,6 +323,10 @@ pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> {
     }
     progress.finish();
 
+    if !check_lines_sorted(BufReader::new(File::open(&index_paths.terms)?))? {
+        reorder_pisa_index(&index_paths)?;
+    }
+
     Ok(())
 }
 
diff --git a/tests/toy.rs b/tests/toy.rs
index 1f9f60f..e7010d0 100644
--- a/tests/toy.rs
+++ b/tests/toy.rs
@@ -109,3 +109,79 @@ fn test_to_and_from_ciff() -> anyhow::Result<()> {
 
     Ok(())
 }
+
+#[test]
+fn test_reorder_terms() -> anyhow::Result<()> {
+    let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff");
+    let temp = TempDir::new().unwrap();
+    let pisa_path = temp.path().join("coll");
+    ciff_to_pisa(&input_path, &pisa_path)?;
+
+    // Rewrite the terms; later, we will check if the posting lists are in reverse order.
+    std::fs::write(
+        temp.path().join("coll.terms"),
+        vec![
+            "veri", "text", "simpl", "head", "enough", "content", "30", "03", "01",
+        ]
+        .join("\n"),
+    )?;
+
+    let ciff_output_path = temp.path().join("ciff");
+    pisa_to_ciff(
+        &pisa_path,
+        &temp.path().join("coll.terms"),
+        &temp.path().join("coll.documents"),
+        &ciff_output_path,
+        "",
+    )?;
+
+    // Convert back to PISA to verify list order
+    let pisa_copy = temp.path().join("copy");
+    ciff_to_pisa(&ciff_output_path, &pisa_copy)?;
+
+    assert_eq!(
+        std::fs::read_to_string(temp.path().join("copy.documents"))?,
+        "WSJ_1\nTREC_DOC_1\nDOC222\n"
+    );
+    assert_eq!(
+        std::fs::read(temp.path().join("coll.sizes"))?,
+        vec![3, 0, 0, 0, 6, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0]
+    );
+    assert_eq!(
+        std::fs::read_to_string(temp.path().join("copy.terms"))?
+            .lines()
+            .collect::<Vec<_>>(),
+        vec!["01", "03", "30", "content", "enough", "head", "simpl", "text", "veri"]
+    );
+    assert_eq!(
+        std::fs::read(temp.path().join("copy.docs"))?,
+        vec![
+            1, 0, 0, 0, 3, 0, 0, 0, // Number of documents
+            1, 0, 0, 0, 1, 0, 0, 0, // t8
+            3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t7
+            2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t6
+            3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, // t5
+            1, 0, 0, 0, 2, 0, 0, 0, // t4
+            1, 0, 0, 0, 0, 0, 0, 0, // t3
+            1, 0, 0, 0, 0, 0, 0, 0, // t2
+            1, 0, 0, 0, 0, 0, 0, 0, // t1
+            1, 0, 0, 0, 0, 0, 0, 0, // t0
+        ]
+    );
+    assert_eq!(
+        std::fs::read(temp.path().join("copy.freqs"))?,
+        vec![
+            1, 0, 0, 0, 1, 0, 0, 0, // t8
+            3, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, // t7
+            2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, // t6
+            3, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, // t5
+            1, 0, 0, 0, 1, 0, 0, 0, // t4
+            1, 0, 0, 0, 1, 0, 0, 0, // t3
+            1, 0, 0, 0, 1, 0, 0, 0, // t2
+            1, 0, 0, 0, 1, 0, 0, 0, // t1
+            1, 0, 0, 0, 1, 0, 0, 0, // t0
+        ]
+    );
+
+    Ok(())
+}

From 4e1f136a3cc9bedab95223cc4ccd3966bf477c43 Mon Sep 17 00:00:00 2001
From: J Mackenzie <JMMackenzie@users.noreply.github.com>
Date: Mon, 28 Feb 2022 11:57:12 +1100
Subject: [PATCH 5/5] Typo

---
 src/binary_collection.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/binary_collection.rs b/src/binary_collection.rs
index 754569f..c71c4da 100644
--- a/src/binary_collection.rs
+++ b/src/binary_collection.rs
@@ -126,7 +126,7 @@ impl<'a> Iterator for BinaryCollection<'a> {
 /// This means [`RandomAccessBinaryCollection::try_from`] will have to
 /// perform one full pass through the entire collection to collect the
 /// offsets. Thus, use this class only if you need the random access
-/// funcionality.
+/// functionality.
 ///
 /// Note that the because offsets are stored within the struct, it is
 /// not `Copy` as opposed to [`BinaryCollection`], which is simply a view