quickwit-oss · fulmicoton · May 17, 2021 · Apr 27, 2021 · Apr 28, 2021 · Apr 28, 2021
diff --git a/src/core/index.rs b/src/core/index.rs
@@ -64,14 +64,14 @@ fn load_metas(
 ///
 /// ```
 /// use tantivy::schema::*;
-/// use tantivy::{Index, IndexSettings};
+/// use tantivy::{Index, IndexSettings, IndexSortByField, Order};
 ///
 /// let mut schema_builder = Schema::builder();
 /// let id_field = schema_builder.add_text_field("id", STRING);
 /// let title_field = schema_builder.add_text_field("title", TEXT);
 /// let body_field = schema_builder.add_text_field("body", TEXT);
 /// let schema = schema_builder.build();
-/// let settings = IndexSettings::default();
+/// let settings = IndexSettings{sort_by_field: IndexSortByField{field:"title".to_string(), order:Order::Asc}};
 /// let index = Index::builder().schema(schema).settings(settings).create_in_ram();
 ///
 /// ```
@@ -80,6 +80,7 @@ pub struct IndexBuilder {
     index_settings: Option<IndexSettings>,
 }
 impl IndexBuilder {
+    /// Creates a new `IndexBuilder`
     pub fn new() -> Self {
         Self {
             schema: None,
@@ -423,7 +424,7 @@ impl Index {
 
     /// Helper to create an index writer for tests.
     ///
-    /// That index writer only simply has a single thread and a heap of 5 MB.
+    /// That index writer simply has a single thread and a heap of 10 MB.
     /// Using a single thread gives us a deterministic allocation of DocId.
     #[cfg(test)]
     pub fn writer_for_tests(&self) -> crate::Result<IndexWriter> {

diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs
@@ -112,6 +112,7 @@ impl SegmentMeta {
             SegmentComponent::Positions => ".pos".to_string(),
             SegmentComponent::Terms => ".term".to_string(),
             SegmentComponent::Store => ".store".to_string(),
+            SegmentComponent::TempStore => ".store.temp".to_string(),
             SegmentComponent::FastFields => ".fast".to_string(),
             SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
             SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
@@ -193,9 +194,36 @@ impl InnerSegmentMeta {
     }
 }
 
-/// Search Index Settings
-#[derive(Clone, Default, Serialize)]
-pub struct IndexSettings {}
+/// Search Index Settings.
+///
+/// Contains settings which are applied to the whole
+/// index, such as presorting documents.
+#[derive(Clone, Serialize)]
+pub struct IndexSettings {
+    /// Sorts the documents by the information
+    /// provided in `IndexSortByField`.
+    pub sort_by_field: IndexSortByField,
+}
+/// Settings to presort the documents in an index.
+///
+/// Presorting documents can greatly improve performance
+/// in some scenarios, by applying top n
+/// optimizations.
+#[derive(Clone, Serialize)]
+pub struct IndexSortByField {
+    /// The field to sort the documents by
+    pub field: String,
+    /// The order to sort the documents by
+    pub order: Order,
+}
+/// The sort order used by `IndexSortByField::order`.
+#[derive(Clone, Serialize, PartialEq)]
+pub enum Order {
+    /// Ascending Order
+    Asc,
+    /// Descending Order
+    Desc,
+}
 /// Meta information about the `Index`.
 ///
 /// This object is serialized on disk in the `meta.json` file.

diff --git a/src/core/mod.rs b/src/core/mod.rs
@@ -10,7 +10,9 @@ mod segment_reader;
 
 pub use self::executor::Executor;
 pub use self::index::{Index, IndexBuilder};
-pub use self::index_meta::{IndexMeta, IndexSettings, SegmentMeta, SegmentMetaInventory};
+pub use self::index_meta::{
+    IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory,
+};
 pub use self::inverted_index_reader::InvertedIndexReader;
 pub use self::searcher::Searcher;
 pub use self::segment::Segment;

diff --git a/src/core/segment.rs b/src/core/segment.rs
@@ -1,5 +1,4 @@
 use super::SegmentComponent;
-use crate::core::Index;
 use crate::core::SegmentId;
 use crate::core::SegmentMeta;
 use crate::directory::error::{OpenReadError, OpenWriteError};
@@ -8,6 +7,7 @@ use crate::directory::{FileSlice, WritePtr};
 use crate::indexer::segment_serializer::SegmentSerializer;
 use crate::schema::Schema;
 use crate::Opstamp;
+use crate::{core::Index, indexer::index_sorter::DocidMapping};
 use std::fmt;
 use std::path::PathBuf;
 
@@ -97,5 +97,9 @@ pub trait SerializableSegment {
     ///
     /// # Returns
     /// The number of documents in the segment.
-    fn write(&self, serializer: SegmentSerializer) -> crate::Result<u32>;
+    fn write(
+        &self,
+        serializer: SegmentSerializer,
+        docid_map: Option<&DocidMapping>,
+    ) -> crate::Result<u32>;
 }
diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs
@@ -22,6 +22,8 @@ pub enum SegmentComponent {
     /// Accessing a document from the store is relatively slow, as it
     /// requires to decompress the entire block it belongs to.
     Store,
+    /// Temporary storage of the documents, before they are streamed to `Store`.
+    TempStore,
     /// Bitset describing which document of the segment is deleted.
     Delete,
 }

diff --git a/src/fastfield/bytes/writer.rs b/src/fastfield/bytes/writer.rs
@@ -1,8 +1,8 @@
 use std::io;
 
-use crate::fastfield::serializer::FastFieldSerializer;
 use crate::schema::{Document, Field, Value};
 use crate::DocId;
+use crate::{fastfield::serializer::FastFieldSerializer, indexer::index_sorter::DocidMapping};
 
 /// Writer for byte array (as in, any number of bytes per document) fast fields
 ///
@@ -72,20 +72,63 @@ impl BytesFastFieldWriter {
         doc
     }
 
+    /// Returns an iterator over the byte values of each document.
+    ///
+    /// Without a `docid_map`, documents are visited in the order of `self.doc_index`.
+    /// With a `docid_map`, the old docids yielded by `iter_old_docids()` are visited instead,
+    /// presumably in the order of the new docids (TODO confirm against `DocidMapping`).
+    fn get_ordered_values<'a: 'b, 'b>(
+        &'a self,
+        docid_map: Option<&'b DocidMapping>,
+    ) -> impl Iterator<Item = &'b [u8]> {
+        let docid_iter = if let Some(docid_map) = docid_map {
+            Box::new(docid_map.iter_old_docids().cloned()) as Box<dyn Iterator<Item = u32>>
+        } else {
+            Box::new(self.doc_index.iter().enumerate().map(|el| el.0 as u32))
+                as Box<dyn Iterator<Item = u32>>
+        };
+        docid_iter.map(move |docid| self.get_values_for_docid(docid))
+    }
+
+    /// Returns the bytes stored for the document with the given docid.
+    fn get_values_for_docid(&self, docid: u32) -> &[u8] {
+        let start_pos = self.doc_index[docid as usize] as usize;
+        let end_pos = if docid as usize + 1 == self.doc_index.len() {
+            // special case: the last docid has no following offset, so it ends at `vals.len()`
+            self.vals.len()
+        } else {
+            self.doc_index[docid as usize + 1] as usize
+        };
+        &self.vals[start_pos..end_pos]
+    }
+
     /// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
-    pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
+    pub fn serialize(
+        &self,
+        serializer: &mut FastFieldSerializer,
+        docid_map: Option<&DocidMapping>,
+    ) -> io::Result<()> {
         // writing the offset index
         let mut doc_index_serializer =
             serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
-        for &offset in &self.doc_index {
+        let mut offset = 0;
+        for vals in self.get_ordered_values(docid_map) {
             doc_index_serializer.add_val(offset)?;
+            offset += vals.len() as u64;
         }
         doc_index_serializer.add_val(self.vals.len() as u64)?;
         doc_index_serializer.close_field()?;
         // writing the values themselves
-        serializer
-            .new_bytes_fast_field_with_idx(self.field, 1)
-            .write_all(&self.vals)?;
+        let mut value_serializer = serializer.new_bytes_fast_field_with_idx(self.field, 1);
+        // The else branch could be removed, but one bulk write is presumably faster (not benchmarked).
+        if let Some(docid_map) = docid_map {
+            for vals in self.get_ordered_values(Some(docid_map)) {
+                // sort values in case of remapped docids?
+                value_serializer.write_all(vals)?;
+            }
+        } else {
+            value_serializer.write_all(&self.vals)?;
+        }
         Ok(())
     }
 }
diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs
@@ -251,7 +251,7 @@ mod tests {
             fast_field_writers.add_document(&doc!(*FIELD=>14u64));
             fast_field_writers.add_document(&doc!(*FIELD=>2u64));
             fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
+                .serialize(&mut serializer, &HashMap::new(), None)
                 .unwrap();
             serializer.close().unwrap();
         }
@@ -283,7 +283,7 @@ mod tests {
             fast_field_writers.add_document(&doc!(*FIELD=>1_002u64));
             fast_field_writers.add_document(&doc!(*FIELD=>1_501u64));
             fast_field_writers.add_document(&doc!(*FIELD=>215u64));
-            fast_field_writers.serialize(&mut serializer, &HashMap::new())?;
+            fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
             serializer.close()?;
         }
         let file = directory.open_read(&path)?;
@@ -318,7 +318,7 @@ mod tests {
                 fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
             }
             fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
+                .serialize(&mut serializer, &HashMap::new(), None)
                 .unwrap();
             serializer.close().unwrap();
         }
@@ -350,7 +350,7 @@ mod tests {
                 fast_field_writers.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + i));
             }
             fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
+                .serialize(&mut serializer, &HashMap::new(), None)
                 .unwrap();
             serializer.close().unwrap();
         }
@@ -389,7 +389,7 @@ mod tests {
                 fast_field_writers.add_document(&doc);
             }
             fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
+                .serialize(&mut serializer, &HashMap::new(), None)
                 .unwrap();
             serializer.close().unwrap();
         }
@@ -429,7 +429,7 @@ mod tests {
             let doc = Document::default();
             fast_field_writers.add_document(&doc);
             fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
+                .serialize(&mut serializer, &HashMap::new(), None)
                 .unwrap();
             serializer.close().unwrap();
         }
@@ -464,7 +464,7 @@ mod tests {
             for &x in &permutation {
                 fast_field_writers.add_document(&doc!(*FIELD=>x));
             }
-            fast_field_writers.serialize(&mut serializer, &HashMap::new())?;
+            fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
             serializer.close()?;
         }
         let file = directory.open_read(&path)?;
@@ -621,7 +621,7 @@ mod bench {
                 fast_field_writers.add_document(&doc!(*FIELD=>x));
             }
             fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
+                .serialize(&mut serializer, &HashMap::new(), None)
                 .unwrap();
             serializer.close().unwrap();
         }
@@ -655,7 +655,7 @@ mod bench {
                 fast_field_writers.add_document(&doc!(*FIELD=>x));
             }
             fast_field_writers
-                .serialize(&mut serializer, &HashMap::new())
+                .serialize(&mut serializer, &HashMap::new(), None)
                 .unwrap();
             serializer.close().unwrap();
         }

diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs
@@ -1,13 +1,12 @@
 use crate::fastfield::serializer::FastSingleFieldSerializer;
-use crate::fastfield::value_to_u64;
 use crate::fastfield::FastFieldSerializer;
 use crate::postings::UnorderedTermId;
 use crate::schema::{Document, Field};
 use crate::termdict::TermOrdinal;
 use crate::DocId;
+use crate::{fastfield::value_to_u64, indexer::index_sorter::DocidMapping};
 use fnv::FnvHashMap;
 use std::io;
-use std::iter::once;
 use tantivy_bitpacker::minmax;
 
 /// Writer for multi-valued (as in, more than one value per document)
@@ -94,7 +93,35 @@ impl MultiValuedFastFieldWriter {
         self.vals.extend_from_slice(vals);
         doc
     }
+    /// Returns an iterator over the values of each document.
+    ///
+    /// Without a `docid_map`, documents are visited in the order of `self.doc_index`.
+    /// With a `docid_map`, the old docids yielded by `iter_old_docids()` are visited instead,
+    /// presumably in the order of the new docids (TODO confirm against `DocidMapping`).
+    fn get_ordered_values<'a: 'b, 'b>(
+        &'a self,
+        docid_map: Option<&'b DocidMapping>,
+    ) -> impl Iterator<Item = &'b [u64]> {
+        let docid_iter = if let Some(docid_map) = docid_map {
+            Box::new(docid_map.iter_old_docids().cloned()) as Box<dyn Iterator<Item = u32>>
+        } else {
+            Box::new(self.doc_index.iter().enumerate().map(|el| el.0 as u32))
+                as Box<dyn Iterator<Item = u32>>
+        };
+        docid_iter.map(move |docid| self.get_values_for_docid(docid))
+    }
 
+    /// Returns all values stored for the document with the given docid.
+    fn get_values_for_docid(&self, docid: u32) -> &[u64] {
+        let start_pos = self.doc_index[docid as usize] as usize;
+        let end_pos = if docid as usize + 1 == self.doc_index.len() {
+            // special case: the last docid has no following offset, so it ends at `vals.len()`
+            self.vals.len()
+        } else {
+            self.doc_index[docid as usize + 1] as usize
+        };
+        &self.vals[start_pos..end_pos]
+    }
     /// Serializes fast field values by pushing them to the `FastFieldSerializer`.
     ///
     /// If a mapping is given, the values are remapped *and sorted* before serialization.
@@ -110,15 +137,20 @@ impl MultiValuedFastFieldWriter {
         &self,
         serializer: &mut FastFieldSerializer,
         mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
+        docid_map: Option<&DocidMapping>,
     ) -> io::Result<()> {
         {
             // writing the offset index
             let mut doc_index_serializer =
                 serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
-            for &offset in &self.doc_index {
+
+            let mut offset = 0;
+            for vals in self.get_ordered_values(docid_map) {
                 doc_index_serializer.add_val(offset)?;
+                offset += vals.len() as u64;
             }
             doc_index_serializer.add_val(self.vals.len() as u64)?;
+
             doc_index_serializer.close_field()?;
         }
         {
@@ -133,18 +165,10 @@ impl MultiValuedFastFieldWriter {
                         1,
                     )?;
 
-                    let last_interval =
-                        self.doc_index.last().cloned().unwrap() as usize..self.vals.len();
-
                     let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
-                    for range in self
-                        .doc_index
-                        .windows(2)
-                        .map(|interval| interval[0] as usize..interval[1] as usize)
-                        .chain(once(last_interval))
-                    {
+                    for vals in self.get_ordered_values(docid_map) {
                         doc_vals.clear();
-                        let remapped_vals = self.vals[range]
+                        let remapped_vals = vals
                             .iter()
                             .map(|val| *mapping.get(val).expect("Missing term ordinal"));
                         doc_vals.extend(remapped_vals);
@@ -159,8 +183,11 @@ impl MultiValuedFastFieldWriter {
                     let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
                     value_serializer =
                         serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
-                    for &val in &self.vals {
-                        value_serializer.add_val(val)?;
+                    for vals in self.get_ordered_values(docid_map) {
+                        // sort values in case of remapped docids?
+                        for &val in vals {
+                            value_serializer.add_val(val)?;
+                        }
                     }
                 }
             }