From cab88cafb1715c8844b69e366b7c8558c3c5f7d0 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Tue, 28 Jan 2025 08:34:41 -0500 Subject: [PATCH 1/3] Make knn graph conn writing more consistent (#14174) * Make graph writing more consistent * correct concurrent connected components logic --- .../lucene99/Lucene99HnswVectorsWriter.java | 29 ++++++++++++------- .../util/hnsw/HnswConcurrentMergeBuilder.java | 4 +-- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index 4983fdec6bf..e219157ab98 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -243,13 +243,14 @@ private HnswGraph reconstructAndWriteGraph( nodesByLevel.add(null); int maxOrd = graph.size(); + int[] scratch = new int[graph.maxConn() * 2]; NodesIterator nodesOnLevel0 = graph.getNodesOnLevel(0); levelNodeOffsets[0] = new int[nodesOnLevel0.size()]; while (nodesOnLevel0.hasNext()) { int node = nodesOnLevel0.nextInt(); NeighborArray neighbors = graph.getNeighbors(0, newToOldMap[node]); long offset = vectorIndex.getFilePointer(); - reconstructAndWriteNeighbours(neighbors, oldToNewMap, maxOrd); + reconstructAndWriteNeighbours(neighbors, oldToNewMap, scratch, maxOrd); levelNodeOffsets[0][node] = Math.toIntExact(vectorIndex.getFilePointer() - offset); } @@ -266,7 +267,7 @@ private HnswGraph reconstructAndWriteGraph( for (int node : newNodes) { NeighborArray neighbors = graph.getNeighbors(level, newToOldMap[node]); long offset = vectorIndex.getFilePointer(); - reconstructAndWriteNeighbours(neighbors, oldToNewMap, maxOrd); + reconstructAndWriteNeighbours(neighbors, oldToNewMap, scratch, maxOrd); levelNodeOffsets[level][nodeOffsetIndex++] = Math.toIntExact(vectorIndex.getFilePointer() - offset); } @@ -313,25 +314,33 @@ public NodesIterator getNodesOnLevel(int level) { }; } - private void reconstructAndWriteNeighbours(NeighborArray neighbors, int[] oldToNewMap, int maxOrd) - throws IOException { + private void reconstructAndWriteNeighbours( + NeighborArray neighbors, int[] oldToNewMap, int[] scratch, int maxOrd) throws IOException { int size = neighbors.size(); - vectorIndex.writeVInt(size); - // Destructively modify; it's ok we are discarding it after this int[] nnodes = neighbors.nodes(); for (int i = 0; i < size; i++) { nnodes[i] = oldToNewMap[nnodes[i]]; } Arrays.sort(nnodes, 0, size); + int actualSize = 0; + if (size > 0) { + scratch[0] = nnodes[0]; + actualSize = 1; + } // Now that we have sorted, do delta encoding to minimize the required bits to store the // information - for (int i = size - 1; i > 0; --i) { + for (int i = 1; i < size; i++) { assert nnodes[i] < maxOrd : "node too large: " + nnodes[i] + ">=" + maxOrd; - nnodes[i] -= nnodes[i - 1]; + if (nnodes[i - 1] == nnodes[i]) { + continue; + } + scratch[actualSize++] = nnodes[i] - nnodes[i - 1]; } - for (int i = 0; i < size; i++) { - vectorIndex.writeVInt(nnodes[i]); + // Write the size after duplicates are removed + vectorIndex.writeVInt(actualSize); + for (int i = 0; i < actualSize; i++) { + vectorIndex.writeVInt(scratch[i]); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java index d2e81addc5d..d9d58c829d3 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java @@ -90,9 +90,7 @@ public OnHeapHnswGraph build(int maxOrd) throws IOException { }); } taskExecutor.invokeAll(futures); - finish(); - frozen = true; - return workers[0].getCompletedGraph(); + return getCompletedGraph(); } @Override From 71256cced2e4e4bae67ccca3159d50099e7eb6f5 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 28 Jan 2025 15:16:30 +0100 Subject: [PATCH 2/3] Specialize DisiPriorityQueue for the 2-clauses case. (#14070) Disjunctions with 2 clauses are rather common. Specializing this case enables some shortcuts. --- .../lucene/search/DisiPriorityQueue.java | 216 +++------------- .../lucene/search/DisiPriorityQueue2.java | 110 +++++++++ .../lucene/search/DisiPriorityQueueN.java | 230 ++++++++++++++++++ .../search/DisjunctionDISIApproximation.java | 2 +- .../lucene/search/DocIdSetBulkIterator.java | 34 +++ .../lucene/search/MaxScoreBulkScorer.java | 2 +- .../org/apache/lucene/search/WANDScorer.java | 8 +- .../lucene/search/TestDisiPriorityQueue.java | 38 ++- .../lucene/sandbox/search/CoveringScorer.java | 2 +- 9 files changed, 448 insertions(+), 194 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java index 034f46ed93f..d6bdf82e48d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.search; -import java.util.Arrays; -import java.util.Iterator; import org.apache.lucene.util.PriorityQueue; /** @@ -27,205 +25,51 @@ * * @lucene.internal */ -public final class DisiPriorityQueue implements Iterable { - - static int leftNode(int node) { - return ((node + 1) << 1) - 1; - } - - static int rightNode(int leftNode) { - return leftNode + 1; - } - - static int parentNode(int node) { - return ((node + 1) >>> 1) - 1; +public abstract sealed class DisiPriorityQueue implements Iterable + permits DisiPriorityQueue2, DisiPriorityQueueN { + + /** Create a {@link DisiPriorityQueue} of the given maximum size. */ + public static DisiPriorityQueue ofMaxSize(int maxSize) { + if (maxSize <= 2) { + return new DisiPriorityQueue2(); + } else { + return new DisiPriorityQueueN(maxSize); + } } - private final DisiWrapper[] heap; - private int size; + /** Return the number of entries in this heap. */ + public abstract int size(); - public DisiPriorityQueue(int maxSize) { - heap = new DisiWrapper[maxSize]; - size = 0; - } - - public int size() { - return size; - } - - public DisiWrapper top() { - return heap[0]; - } + /** Return top value in this heap, or null if the heap is empty. */ + public abstract DisiWrapper top(); /** Return the 2nd least value in this heap, or null if the heap contains less than 2 values. */ - public DisiWrapper top2() { - switch (size()) { - case 0: - case 1: - return null; - case 2: - return heap[1]; - default: - if (heap[1].doc <= heap[2].doc) { - return heap[1]; - } else { - return heap[2]; - } - } - } + public abstract DisiWrapper top2(); /** Get the list of scorers which are on the current doc. */ - public DisiWrapper topList() { - final DisiWrapper[] heap = this.heap; - final int size = this.size; - DisiWrapper list = heap[0]; - list.next = null; - if (size >= 3) { - list = topList(list, heap, size, 1); - list = topList(list, heap, size, 2); - } else if (size == 2 && heap[1].doc == list.doc) { - list = prepend(heap[1], list); - } - return list; - } - - // prepend w1 (iterator) to w2 (list) - private DisiWrapper prepend(DisiWrapper w1, DisiWrapper w2) { - w1.next = w2; - return w1; - } - - private DisiWrapper topList(DisiWrapper list, DisiWrapper[] heap, int size, int i) { - final DisiWrapper w = heap[i]; - if (w.doc == list.doc) { - list = prepend(w, list); - final int left = leftNode(i); - final int right = left + 1; - if (right < size) { - list = topList(list, heap, size, left); - list = topList(list, heap, size, right); - } else if (left < size && heap[left].doc == list.doc) { - list = prepend(heap[left], list); - } - } - return list; - } + public abstract DisiWrapper topList(); - public DisiWrapper add(DisiWrapper entry) { - final DisiWrapper[] heap = this.heap; - final int size = this.size; - heap[size] = entry; - upHeap(size); - this.size = size + 1; - return heap[0]; - } + /** Add a {@link DisiWrapper} to this queue and return the top entry. */ + public abstract DisiWrapper add(DisiWrapper entry); + /** Bulk add. */ public void addAll(DisiWrapper[] entries, int offset, int len) { - // Nothing to do if empty: - if (len == 0) { - return; - } - - // Fail early if we're going to over-fill: - if (size + len > heap.length) { - throw new IndexOutOfBoundsException( - "Cannot add " - + len - + " elements to a queue with remaining capacity " - + (heap.length - size)); - } - - // Copy the entries over to our heap array: - System.arraycopy(entries, offset, heap, size, len); - size += len; - - // Heapify in bulk: - final int firstLeafIndex = size >>> 1; - for (int rootIndex = firstLeafIndex - 1; rootIndex >= 0; rootIndex--) { - int parentIndex = rootIndex; - DisiWrapper parent = heap[parentIndex]; - while (parentIndex < firstLeafIndex) { - int childIndex = leftNode(parentIndex); - int rightChildIndex = rightNode(childIndex); - DisiWrapper child = heap[childIndex]; - if (rightChildIndex < size && heap[rightChildIndex].doc < child.doc) { - child = heap[rightChildIndex]; - childIndex = rightChildIndex; - } - if (child.doc >= parent.doc) { - break; - } - heap[parentIndex] = child; - parentIndex = childIndex; - } - heap[parentIndex] = parent; + for (int i = 0; i < len; ++i) { + add(entries[offset + i]); } } - public DisiWrapper pop() { - final DisiWrapper[] heap = this.heap; - final DisiWrapper result = heap[0]; - final int i = --size; - heap[0] = heap[i]; - heap[i] = null; - downHeap(i); - return result; - } + /** Remove the top entry and return it. */ + public abstract DisiWrapper pop(); - public DisiWrapper updateTop() { - downHeap(size); - return heap[0]; - } + /** Rebalance this heap and return the top entry. */ + public abstract DisiWrapper updateTop(); - DisiWrapper updateTop(DisiWrapper topReplacement) { - heap[0] = topReplacement; - return updateTop(); - } + /** + * Replace the top entry with the given entry, rebalance the heap, and return the new top entry. + */ + abstract DisiWrapper updateTop(DisiWrapper topReplacement); /** Clear the heap. */ - public void clear() { - Arrays.fill(heap, null); - size = 0; - } - - void upHeap(int i) { - final DisiWrapper node = heap[i]; - final int nodeDoc = node.doc; - int j = parentNode(i); - while (j >= 0 && nodeDoc < heap[j].doc) { - heap[i] = heap[j]; - i = j; - j = parentNode(j); - } - heap[i] = node; - } - - void downHeap(int size) { - int i = 0; - final DisiWrapper node = heap[0]; - int j = leftNode(i); - if (j < size) { - int k = rightNode(j); - if (k < size && heap[k].doc < heap[j].doc) { - j = k; - } - if (heap[j].doc < node.doc) { - do { - heap[i] = heap[j]; - i = j; - j = leftNode(i); - k = rightNode(j); - if (k < size && heap[k].doc < heap[j].doc) { - j = k; - } - } while (j < size && heap[j].doc < node.doc); - heap[i] = node; - } - } - } - - @Override - public Iterator iterator() { - return Arrays.asList(heap).subList(0, size).iterator(); - } + public abstract void clear(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java new file mode 100644 index 00000000000..b7e587382db --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; + +/** {@link DisiPriorityQueue} of two entries or less. */ +final class DisiPriorityQueue2 extends DisiPriorityQueue { + + private DisiWrapper top, top2; + + @Override + public Iterator iterator() { + if (top2 != null) { + return Arrays.asList(top, top2).iterator(); + } else if (top != null) { + return Collections.singleton(top).iterator(); + } else { + return Collections.emptyIterator(); + } + } + + @Override + public int size() { + return top2 == null ? (top == null ? 0 : 1) : 2; + } + + @Override + public DisiWrapper top() { + return top; + } + + @Override + public DisiWrapper top2() { + return top2; + } + + @Override + public DisiWrapper topList() { + DisiWrapper topList = null; + if (top != null) { + top.next = null; + topList = top; + if (top2 != null && top.doc == top2.doc) { + top2.next = topList; + topList = top2; + } + } + return topList; + } + + @Override + public DisiWrapper add(DisiWrapper entry) { + if (top == null) { + return top = entry; + } else if (top2 == null) { + top2 = entry; + return updateTop(); + } else { + throw new IllegalStateException( + "Trying to add a 3rd element to a DisiPriorityQueue configured with a max size of 2"); + } + } + + @Override + public DisiWrapper pop() { + DisiWrapper ret = top; + top = top2; + top2 = null; + return ret; + } + + @Override + public DisiWrapper updateTop() { + if (top2 != null && top2.doc < top.doc) { + DisiWrapper tmp = top; + top = top2; + top2 = tmp; + } + return top; + } + + @Override + DisiWrapper updateTop(DisiWrapper topReplacement) { + top = topReplacement; + return updateTop(); + } + + @Override + public void clear() { + top = null; + top2 = null; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java new file mode 100644 index 00000000000..b841c3ef0ef --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.util.Arrays; +import java.util.Iterator; + +final class DisiPriorityQueueN extends DisiPriorityQueue { + + static int leftNode(int node) { + return ((node + 1) << 1) - 1; + } + + static int rightNode(int leftNode) { + return leftNode + 1; + } + + static int parentNode(int node) { + return ((node + 1) >>> 1) - 1; + } + + private final DisiWrapper[] heap; + private int size; + + DisiPriorityQueueN(int maxSize) { + heap = new DisiWrapper[maxSize]; + size = 0; + } + + @Override + public int size() { + return size; + } + + @Override + public DisiWrapper top() { + return heap[0]; + } + + @Override + public DisiWrapper top2() { + switch (size()) { + case 0: + case 1: + return null; + case 2: + return heap[1]; + default: + if (heap[1].doc <= heap[2].doc) { + return heap[1]; + } else { + return heap[2]; + } + } + } + + @Override + public DisiWrapper topList() { + final DisiWrapper[] heap = this.heap; + final int size = this.size; + DisiWrapper list = heap[0]; + list.next = null; + if (size >= 3) { + list = topList(list, heap, size, 1); + list = topList(list, heap, size, 2); + } else if (size == 2 && heap[1].doc == list.doc) { + list = prepend(heap[1], list); + } + return list; + } + + // prepend w1 (iterator) to w2 (list) + private DisiWrapper prepend(DisiWrapper w1, DisiWrapper w2) { + w1.next = w2; + return w1; + } + + private DisiWrapper topList(DisiWrapper list, DisiWrapper[] heap, int size, int i) { + final DisiWrapper w = heap[i]; + if (w.doc == list.doc) { + list = prepend(w, list); + final int left = leftNode(i); + final int right = rightNode(left); + if (right < size) { + list = topList(list, heap, size, left); + list = topList(list, heap, size, right); + } else if (left < size && heap[left].doc == list.doc) { + list = prepend(heap[left], list); + } + } + return list; + } + + @Override + public DisiWrapper add(DisiWrapper entry) { + final DisiWrapper[] heap = this.heap; + final int size = this.size; + heap[size] = entry; + upHeap(size); + this.size = size + 1; + return heap[0]; + } + + @Override + public void addAll(DisiWrapper[] entries, int offset, int len) { + // Nothing to do if empty: + if (len == 0) { + return; + } + + // Fail early if we're going to over-fill: + if (size + len > heap.length) { + throw new IndexOutOfBoundsException( + "Cannot add " + + len + + " elements to a queue with remaining capacity " + + (heap.length - size)); + } + + // Copy the entries over to our heap array: + System.arraycopy(entries, offset, heap, size, len); + size += len; + + // Heapify in bulk: + final int firstLeafIndex = size >>> 1; + for (int rootIndex = firstLeafIndex - 1; rootIndex >= 0; rootIndex--) { + int parentIndex = rootIndex; + DisiWrapper parent = heap[parentIndex]; + while (parentIndex < firstLeafIndex) { + int childIndex = leftNode(parentIndex); + int rightChildIndex = rightNode(childIndex); + DisiWrapper child = heap[childIndex]; + if (rightChildIndex < size && heap[rightChildIndex].doc < child.doc) { + child = heap[rightChildIndex]; + childIndex = rightChildIndex; + } + if (child.doc >= parent.doc) { + break; + } + heap[parentIndex] = child; + parentIndex = childIndex; + } + heap[parentIndex] = parent; + } + } + + @Override + public DisiWrapper pop() { + final DisiWrapper[] heap = this.heap; + final DisiWrapper result = heap[0]; + final int i = --size; + heap[0] = heap[i]; + heap[i] = null; + downHeap(i); + return result; + } + + @Override + public DisiWrapper updateTop() { + downHeap(size); + return heap[0]; + } + + @Override + DisiWrapper updateTop(DisiWrapper topReplacement) { + heap[0] = topReplacement; + return updateTop(); + } + + @Override + public void clear() { + Arrays.fill(heap, null); + size = 0; + } + + void upHeap(int i) { + final DisiWrapper node = heap[i]; + final int nodeDoc = node.doc; + int j = parentNode(i); + while (j >= 0 && nodeDoc < heap[j].doc) { + heap[i] = heap[j]; + i = j; + j = parentNode(j); + } + heap[i] = node; + } + + void downHeap(int size) { + int i = 0; + final DisiWrapper node = heap[0]; + int j = leftNode(i); + if (j < size) { + int k = rightNode(j); + if (k < size && heap[k].doc < heap[j].doc) { + j = k; + } + if (heap[j].doc < node.doc) { + do { + heap[i] = heap[j]; + i = j; + j = leftNode(i); + k = rightNode(j); + if (k < size && heap[k].doc < heap[j].doc) { + j = k; + } + } while (j < size && heap[j].doc < node.doc); + heap[i] = node; + } + } + } + + @Override + public Iterator iterator() { + return Arrays.asList(heap).subList(0, size).iterator(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java index cedababbce6..08018dacf9b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java @@ -91,7 +91,7 @@ public DisjunctionDISIApproximation( // Build the PQ: assert lastIdx >= -1 && lastIdx < wrappers.length - 1; int pqLen = wrappers.length - lastIdx - 1; - leadIterators = new DisiPriorityQueue(pqLen); + leadIterators = DisiPriorityQueue.ofMaxSize(pqLen); leadIterators.addAll(wrappers, lastIdx + 1, pqLen); // Build the non-PQ list: diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java new file mode 100644 index 00000000000..87912beeccc --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.util.Bits; + +/** Bulk iterator over a {@link DocIdSetIterator}. */ +public abstract class DocIdSetBulkIterator { + + /** Sole constructor, invoked by sub-classes. */ + protected DocIdSetBulkIterator() {} + + /** + * Iterate over documents contained in this iterator and call {@link LeafCollector#collect} on + * them. + */ + public abstract void iterate(LeafCollector collector, Bits acceptDocs, int min, int max) + throws IOException; +} diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java index 93dd1ea91e3..30b1d4b7e5a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java @@ -64,7 +64,7 @@ final class MaxScoreBulkScorer extends BulkScorer { allScorers[i++] = w; } this.cost = cost; - essentialQueue = new DisiPriorityQueue(allScorers.length); + essentialQueue = DisiPriorityQueue.ofMaxSize(allScorers.length); maxScoreSums = new double[allScorers.length]; } diff --git a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java index 897713dbe17..88ffa4a0c62 100644 --- a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java @@ -16,9 +16,9 @@ */ package org.apache.lucene.search; -import static org.apache.lucene.search.DisiPriorityQueue.leftNode; -import static org.apache.lucene.search.DisiPriorityQueue.parentNode; -import static org.apache.lucene.search.DisiPriorityQueue.rightNode; +import static org.apache.lucene.search.DisiPriorityQueueN.leftNode; +import static org.apache.lucene.search.DisiPriorityQueueN.parentNode; +import static org.apache.lucene.search.DisiPriorityQueueN.rightNode; import static org.apache.lucene.search.ScorerUtil.costWithMinShouldMatch; import java.io.IOException; @@ -170,7 +170,7 @@ private static long scaleMinScore(float minScore, int scalingFactor) { this.scoreMode = scoreMode; - head = new DisiPriorityQueue(scorers.size()); + head = DisiPriorityQueue.ofMaxSize(scorers.size()); // there can be at most num_scorers - 1 scorers beyond the current position tail = new DisiWrapper[scorers.size()]; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java b/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java index fb7afac8ba4..967c5a34d7d 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java @@ -26,6 +26,42 @@ public class TestDisiPriorityQueue extends LuceneTestCase { + public void testDisiPriorityQueue2() throws IOException { + Random r = random(); + DisiWrapper w1 = wrapper(randomDisi(r)); + DisiWrapper w2 = wrapper(randomDisi(r)); + DisiWrapper w3 = wrapper(randomDisi(r)); + + DisiPriorityQueue pq = DisiPriorityQueue.ofMaxSize(2); + w1.doc = 1; + w2.doc = 0; + assertNull(pq.top()); + assertEquals(0, pq.size()); + assertSame(w1, pq.add(w1)); + assertSame(w1, pq.top()); + assertEquals(1, pq.size()); + assertSame(w2, pq.add(w2)); + assertSame(w2, pq.top()); + assertEquals(2, pq.size()); + expectThrows(IllegalStateException.class, () -> pq.add(w3)); + + w2.doc = 1; + assertSame(w2, pq.updateTop()); + DisiWrapper topList = pq.topList(); + assertSame(w1, topList); + assertSame(w2, topList.next); + assertNull(topList.next.next); + + w2.doc = 2; + assertSame(w1, pq.updateTop()); + topList = pq.topList(); + assertSame(w1, topList); + assertNull(topList.next); + + assertSame(w1, pq.pop()); + assertSame(w2, pq.top()); + } + public void testRandom() throws Exception { Random r = random(); @@ -37,7 +73,7 @@ public void testRandom() throws Exception { all[i] = w; } - DisiPriorityQueue pq = new DisiPriorityQueue(size); + DisiPriorityQueue pq = DisiPriorityQueue.ofMaxSize(size); if (r.nextBoolean()) { for (DisiWrapper w : all) { pq.add(w); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java index dfedb51ed1f..09e2bb57af7 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java @@ -51,7 +51,7 @@ final class CoveringScorer extends Scorer { this.minMatchValues = minMatchValues; this.doc = -1; - subScorers = new DisiPriorityQueue(scorers.size()); + subScorers = DisiPriorityQueue.ofMaxSize(scorers.size()); for (Scorer scorer : scorers) { subScorers.add(new DisiWrapper(scorer, false)); From a7b7f0d6583c5532337320efee71d4797f473b60 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Tue, 28 Jan 2025 14:31:14 -0800 Subject: [PATCH 3/3] Upgrade OpenNLP from 2.3.2 to 2.5.3 (#14130) --- lucene/CHANGES.txt | 3 +++ .../lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java | 6 +++--- lucene/licenses/opennlp-tools-2.3.2.jar.sha1 | 1 - lucene/licenses/opennlp-tools-2.5.3.jar.sha1 | 1 + lucene/licenses/slf4j-api-1.7.36.jar.sha1 | 1 - lucene/licenses/slf4j-api-2.0.16.jar.sha1 | 1 + versions.lock | 8 ++++---- versions.toml | 2 +- 8 files changed, 13 insertions(+), 10 deletions(-) delete mode 100644 lucene/licenses/opennlp-tools-2.3.2.jar.sha1 create mode 100644 lucene/licenses/opennlp-tools-2.5.3.jar.sha1 delete mode 100644 lucene/licenses/slf4j-api-1.7.36.jar.sha1 create mode 100644 lucene/licenses/slf4j-api-2.0.16.jar.sha1 diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b36a48c2fd0..fb9e7665a9e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -102,6 +102,9 @@ Other * GITHUB#14091: Cover all DataType. (Lu Xugang) +* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j + from 1.7.36 to 2.0.16. (Michael Froh) + ======================= Lucene 10.1.0 ======================= API Changes diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java index dee4afefc58..ef7a6fb6245 100644 --- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java +++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java @@ -17,8 +17,8 @@ package org.apache.lucene.analysis.opennlp.tools; -import java.io.IOException; import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSTagFormat; import opennlp.tools.postag.POSTagger; import opennlp.tools.postag.POSTaggerME; @@ -29,8 +29,8 @@ public class NLPPOSTaggerOp { private final POSTagger tagger; - public NLPPOSTaggerOp(POSModel model) throws IOException { - tagger = new POSTaggerME(model); + public NLPPOSTaggerOp(POSModel model) { + tagger = new POSTaggerME(model, POSTagFormat.PENN); } public synchronized String[] getPOSTags(String[] words) { diff --git a/lucene/licenses/opennlp-tools-2.3.2.jar.sha1 b/lucene/licenses/opennlp-tools-2.3.2.jar.sha1 deleted file mode 100644 index 94b2924f8fa..00000000000 --- a/lucene/licenses/opennlp-tools-2.3.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -d739edba1e729691ed5ab80e1ccf330555a02ea7 diff --git a/lucene/licenses/opennlp-tools-2.5.3.jar.sha1 b/lucene/licenses/opennlp-tools-2.5.3.jar.sha1 new file mode 100644 index 00000000000..fb01299fa29 --- /dev/null +++ b/lucene/licenses/opennlp-tools-2.5.3.jar.sha1 @@ -0,0 +1 @@ +4b544138ec079c1c73dc2c1b928506871c4b1b47 diff --git a/lucene/licenses/slf4j-api-1.7.36.jar.sha1 b/lucene/licenses/slf4j-api-1.7.36.jar.sha1 deleted file mode 100644 index 828b7cf7e05..00000000000 --- a/lucene/licenses/slf4j-api-1.7.36.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -6c62681a2f655b49963a5983b8b0950a6120ae14 diff --git a/lucene/licenses/slf4j-api-2.0.16.jar.sha1 b/lucene/licenses/slf4j-api-2.0.16.jar.sha1 new file mode 100644 index 00000000000..b1bb75be39b --- /dev/null +++ b/lucene/licenses/slf4j-api-2.0.16.jar.sha1 @@ -0,0 +1 @@ +0172931663a09a1fa515567af5fbef00897d3c04 diff --git a/versions.lock b/versions.lock index f3057288a9f..07f8ff30543 100644 --- a/versions.lock +++ b/versions.lock @@ -12,7 +12,7 @@ "org.antlr:antlr4-runtime:4.11.1" : "d9953130,refs=4", "org.apache.commons:commons-compress:1.19" : "5ce8cdc6,refs=2", "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2", - "org.apache.opennlp:opennlp-tools:2.3.2" : "2f760bab,refs=4", + "org.apache.opennlp:opennlp-tools:2.5.3" : "2f760bab,refs=4", "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4", "org.carrot2:morfologik-polish:2.1.9" : "fe494320,refs=3", "org.carrot2:morfologik-stemming:2.1.9" : "79af844b,refs=4", @@ -22,7 +22,7 @@ "org.ow2.asm:asm:9.6" : "d9953130,refs=4", "org.ow2.asm:asm-commons:9.6" : "d9953130,refs=4", "org.ow2.asm:asm-tree:9.6" : "d9953130,refs=4", - "org.slf4j:slf4j-api:1.7.36" : "2f760bab,refs=4", + "org.slf4j:slf4j-api:2.0.16" : "2f760bab,refs=4", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "fe494320,refs=3", "xerces:xercesImpl:2.12.0" : "5ce8cdc6,refs=2" }, @@ -56,7 +56,7 @@ "org.antlr:antlr4-runtime:4.11.1" : "6fbc4021,refs=5", "org.apache.commons:commons-compress:1.19" : "6f16ff86,refs=2", "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", - "org.apache.opennlp:opennlp-tools:2.3.2" : "b91715f0,refs=6", + "org.apache.opennlp:opennlp-tools:2.5.3" : "b91715f0,refs=6", "org.assertj:assertj-core:3.21.0" : "b7ba1646,refs=2", "org.carrot2:morfologik-fsa:2.1.9" : "e077a675,refs=8", "org.carrot2:morfologik-polish:2.1.9" : "cb00cecf,refs=5", @@ -73,7 +73,7 @@ "org.ow2.asm:asm-commons:9.6" : "6fbc4021,refs=5", "org.ow2.asm:asm-tree:9.6" : "6fbc4021,refs=5", "org.pcollections:pcollections:3.1.4" : "6897bc09,refs=38", - "org.slf4j:slf4j-api:1.7.36" : "b91715f0,refs=6", + "org.slf4j:slf4j-api:2.0.16" : "b91715f0,refs=6", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "cb00cecf,refs=5", "xerces:xercesImpl:2.12.0" : "6f16ff86,refs=2" } diff --git a/versions.toml b/versions.toml index d1e693e03ca..679287f9d7d 100644 --- a/versions.toml +++ b/versions.toml @@ -25,7 +25,7 @@ minJava = "21" morfologik = "2.1.9" morfologik-ukrainian = "4.9.1" nekohtml = "1.9.17" -opennlp = "2.3.2" +opennlp = "2.5.3" procfork = "1.0.6" randomizedtesting = "2.8.1" rat = "0.14"