LUCENE-10458: BoundedDocSetIdIterator may supply error count in Weight#count(LeafReaderContext) when missingValue enables (#736)
apache · Mar 23, 2022 · 5450d72 · 5450d72
1 parent 1c6f631
commit 5450d72
Show file tree

Hide file tree

Showing 3 changed files with 148 additions and 27 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -56,6 +56,10 @@ API Changes
 
 New Features
 ---------------------
+
+* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
+  to speed up computing the number of hits when possible. (Lu Xugang, Luca Cavanna, Adrien Grand)
+
 * LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory` 
 implementation. `Monitor` can be created with a readonly `QueryIndex` in order to 
 have readonly `Monitor` instances. (Niko Usai)
@@ -66,6 +70,7 @@ Improvements
 
 Optimizations
 ---------------------
+
 * LUCENE-10452: Hunspell: call checkCanceled less frequently to reduce the overhead (Peter Gromov)
 
 * LUCENE-10451: Hunspell: don't perform potentially expensive spellchecking after timeout (Peter Gromov)
@@ -195,9 +200,6 @@ New Features
   based on TotalHitCountCollector that allows users to parallelize counting the
   number of hits. (Luca Cavanna, Adrien Grand)
 
-* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
-  to speed up computing the number of hits when possible. (Luca Cavanna, Adrien Grand)
-
 * LUCENE-10403: Add ArrayUtil#grow(T[]). (Greg Miller)
 
 * LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss, 

diff --git a/.../src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java b/.../src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
@@ -20,8 +20,10 @@
 import java.util.Objects;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.search.ConstantScoreScorer;
 import org.apache.lucene.search.ConstantScoreWeight;
@@ -198,16 +200,18 @@ public boolean isCacheable(LeafReaderContext ctx) {
 
       @Override
       public int count(LeafReaderContext context) throws IOException {
-        BoundedDocSetIdIterator disi = getDocIdSetIteratorOrNull(context);
-        if (disi != null) {
-          return disi.lastDoc - disi.firstDoc;
+        if (context.reader().hasDeletions() == false) {
+          BoundedDocIdSetIterator disi = getDocIdSetIteratorOrNull(context);
+          if (disi != null && disi.delegate == null) {
+            return disi.lastDoc - disi.firstDoc;
+          }
         }
         return fallbackWeight.count(context);
       }
     };
   }
 
-  private BoundedDocSetIdIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
+  private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
       throws IOException {
     SortedNumericDocValues sortedNumericValues =
         DocValues.getSortedNumeric(context.reader(), field);
@@ -237,7 +241,7 @@ private BoundedDocSetIdIterator getDocIdSetIteratorOrNull(LeafReaderContext cont
    * {@link DocIdSetIterator} makes sure to wrap the original docvalues to skip over documents with
    * no value.
    */
-  private BoundedDocSetIdIterator getDocIdSetIterator(
+  private BoundedDocIdSetIterator getDocIdSetIterator(
       SortField sortField, LeafReaderContext context, DocIdSetIterator delegate)
       throws IOException {
     long lower = sortField.getReverse() ? upperValue : lowerValue;
@@ -278,7 +282,19 @@ private BoundedDocSetIdIterator getDocIdSetIterator(
     }
 
     int lastDocIdExclusive = high + 1;
-    return new BoundedDocSetIdIterator(firstDocIdInclusive, lastDocIdExclusive, delegate);
+    Object missingValue = sortField.getMissingValue();
+    BoundedDocIdSetIterator disi;
+    LeafReader reader = context.reader();
+    PointValues pointValues = reader.getPointValues(field);
+    final long missingLongValue = missingValue == null ? 0L : (long) missingValue;
+    // all documents have docValues or missing value falls outside the range
+    if ((pointValues != null && pointValues.getDocCount() == reader.maxDoc())
+        || (missingLongValue < lowerValue || missingLongValue > upperValue)) {
+      disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, null);
+    } else {
+      disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, delegate);
+    }
+    return disi;
   }
 
   /** Compares the given document's value with a stored reference value. */
@@ -306,14 +322,14 @@ private static ValueComparator loadComparator(
    * A doc ID set iterator that wraps a delegate iterator and only returns doc IDs in the range
    * [firstDocInclusive, lastDoc).
    */
-  private static class BoundedDocSetIdIterator extends DocIdSetIterator {
+  private static class BoundedDocIdSetIterator extends DocIdSetIterator {
     private final int firstDoc;
     private final int lastDoc;
     private final DocIdSetIterator delegate;
 
     private int docID = -1;
 
-    BoundedDocSetIdIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) {
+    BoundedDocIdSetIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) {
       this.firstDoc = firstDoc;
       this.lastDoc = lastDoc;
       this.delegate = delegate;
@@ -335,7 +351,12 @@ public int advance(int target) throws IOException {
         target = firstDoc;
       }
 
-      int result = delegate.advance(target);
+      int result;
+      if (delegate != null) {
+        result = delegate.advance(target);
+      } else {
+        result = target;
+      }
       if (result < lastDoc) {
         docID = result;
       } else {

diff --git a/.../test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java b/.../test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
@@ -20,9 +20,11 @@
 
 import java.io.IOException;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriterConfig;
@@ -59,7 +61,14 @@ public void testSameHitsAsPointRangeQuery() throws IOException {
       IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
       boolean reverse = random().nextBoolean();
       SortField sortField = new SortedNumericSortField("dv", SortField.Type.LONG, reverse);
-      sortField.setMissingValue(random().nextLong());
+      boolean enableMissingValue = random().nextBoolean();
+      if (enableMissingValue) {
+        long missingValue =
+            random().nextBoolean()
+                ? TestUtil.nextLong(random(), -100, 10000)
+                : (random().nextBoolean() ? Long.MIN_VALUE : Long.MAX_VALUE);
+        sortField.setMissingValue(missingValue);
+      }
       iwc.setIndexSort(new Sort(sortField));
 
       RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@@ -459,7 +468,7 @@ public void testIndexSortOptimizationDeactivated(RandomIndexWriter writer) throw
     reader.close();
   }
 
-  public void testCount() throws IOException {
+  public void testFallbackCount() throws IOException {
     Directory dir = newDirectory();
     IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
     Sort indexSort = new Sort(new SortedNumericSortField("field", SortField.Type.LONG));
@@ -471,44 +480,133 @@ public void testCount() throws IOException {
     IndexReader reader = writer.getReader();
     IndexSearcher searcher = newSearcher(reader);
 
-    Query fallbackQuery = LongPoint.newRangeQuery("field", 1, 42);
-    Query query = new IndexSortSortedNumericDocValuesRangeQuery("field", 1, 42, fallbackQuery);
+    // we use an unrealistic query that exposes its own Weight#count
+    Query fallbackQuery = new MatchNoDocsQuery();
+    // the index is not sorted on this field, the fallback query is used
+    Query query = new IndexSortSortedNumericDocValuesRangeQuery("another", 1, 42, fallbackQuery);
     Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
     for (LeafReaderContext context : searcher.getLeafContexts()) {
-      assertEquals(1, weight.count(context));
+      assertEquals(0, weight.count(context));
     }
 
     writer.close();
     reader.close();
     dir.close();
   }
 
-  public void testFallbackCount() throws IOException {
+  public void testCompareCount() throws IOException {
+    final int iters = atLeast(10);
+    for (int iter = 0; iter < iters; ++iter) {
+      Directory dir = newDirectory();
+      IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+      SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG);
+      boolean enableMissingValue = random().nextBoolean();
+      if (enableMissingValue) {
+        long missingValue =
+            random().nextBoolean()
+                ? TestUtil.nextLong(random(), -100, 10000)
+                : (random().nextBoolean() ? Long.MIN_VALUE : Long.MAX_VALUE);
+        sortField.setMissingValue(missingValue);
+      }
+      iwc.setIndexSort(new Sort(sortField));
+
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+
+      final int numDocs = atLeast(100);
+      for (int i = 0; i < numDocs; ++i) {
+        Document doc = new Document();
+        final int numValues = TestUtil.nextInt(random(), 0, 1);
+        for (int j = 0; j < numValues; ++j) {
+          final long value = TestUtil.nextLong(random(), -100, 10000);
+          doc = createSNDVAndPointDocument("field", value);
+        }
+        writer.addDocument(doc);
+      }
+
+      if (random().nextBoolean()) {
+        writer.deleteDocuments(LongPoint.newRangeQuery("field", 0L, 10L));
+      }
+
+      final IndexReader reader = writer.getReader();
+      final IndexSearcher searcher = newSearcher(reader);
+      writer.close();
+
+      for (int i = 0; i < 100; ++i) {
+        final long min =
+            random().nextBoolean() ? Long.MIN_VALUE : TestUtil.nextLong(random(), -100, 10000);
+        final long max =
+            random().nextBoolean() ? Long.MAX_VALUE : TestUtil.nextLong(random(), -100, 10000);
+        final Query q1 = LongPoint.newRangeQuery("field", min, max);
+
+        final Query fallbackQuery = LongPoint.newRangeQuery("field", min, max);
+        final Query q2 =
+            new IndexSortSortedNumericDocValuesRangeQuery("field", min, max, fallbackQuery);
+        final Weight weight1 = q1.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+        final Weight weight2 = q2.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+        assertSameCount(weight1, weight2, searcher);
+      }
+
+      reader.close();
+      dir.close();
+    }
+  }
+
+  private void assertSameCount(Weight weight1, Weight weight2, IndexSearcher searcher)
+      throws IOException {
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(weight1.count(context), weight2.count(context));
+    }
+  }
+
+  public void testCountBoundary() throws IOException {
     Directory dir = newDirectory();
     IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
-    Sort indexSort = new Sort(new SortedNumericSortField("field", SortField.Type.LONG));
+    SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG);
+    boolean useLower = random().nextBoolean();
+    long lowerValue = 1;
+    long upperValue = 100;
+    sortField.setMissingValue(useLower ? lowerValue : upperValue);
+    Sort indexSort = new Sort(sortField);
     iwc.setIndexSort(indexSort);
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
-    Document doc = new Document();
-    doc.add(new SortedNumericDocValuesField("field", 10));
-    writer.addDocument(doc);
+
+    writer.addDocument(
+        createSNDVAndPointDocument("field", random().nextLong(lowerValue, upperValue)));
+    writer.addDocument(
+        createSNDVAndPointDocument("field", random().nextLong(lowerValue, upperValue)));
+    // missingValue
+    writer.addDocument(createMissingValueDocument());
+
     IndexReader reader = writer.getReader();
     IndexSearcher searcher = newSearcher(reader);
 
-    // we use an unrealistic query that exposes its own Weight#count
-    Query fallbackQuery = new MatchNoDocsQuery();
-    // the index is not sorted on this field, the fallback query is used
-    Query query = new IndexSortSortedNumericDocValuesRangeQuery("another", 1, 42, fallbackQuery);
+    Query fallbackQuery = LongPoint.newRangeQuery("field", lowerValue, upperValue);
+    Query query =
+        new IndexSortSortedNumericDocValuesRangeQuery(
+            "field", lowerValue, upperValue, fallbackQuery);
     Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
     for (LeafReaderContext context : searcher.getLeafContexts()) {
-      assertEquals(0, weight.count(context));
+      assertEquals(2, weight.count(context));
     }
 
     writer.close();
     reader.close();
     dir.close();
   }
 
+  private Document createMissingValueDocument() {
+    Document doc = new Document();
+    doc.add(new StringField("foo", "fox", Field.Store.YES));
+    return doc;
+  }
+
+  private Document createSNDVAndPointDocument(String field, long value) {
+    Document doc = new Document();
+    doc.add(new SortedNumericDocValuesField(field, value));
+    doc.add(new LongPoint(field, value));
+    return doc;
+  }
+
   private Document createDocument(String field, long value) {
     Document doc = new Document();
     doc.add(new SortedNumericDocValuesField(field, value));