Merge remote-tracking branch 'upstream/main' into test/adj-knn-merge-stability
benwtrent committed Jan 29, 2025
2 parents f881e6c + a7b7f0d commit 379aee4
Showing 18 changed files with 480 additions and 214 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -102,6 +102,9 @@ Other

* GITHUB#14091: Cover all DataType. (Lu Xugang)

* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j
from 1.7.36 to 2.0.16. (Michael Froh)

======================= Lucene 10.1.0 =======================

API Changes
@@ -17,8 +17,8 @@

package org.apache.lucene.analysis.opennlp.tools;

import java.io.IOException;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTagFormat;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;

@@ -29,8 +29,8 @@
public class NLPPOSTaggerOp {
private final POSTagger tagger;

public NLPPOSTaggerOp(POSModel model) throws IOException {
tagger = new POSTaggerME(model);
public NLPPOSTaggerOp(POSModel model) {
tagger = new POSTaggerME(model, POSTagFormat.PENN);
}

public synchronized String[] getPOSTags(String[] words) {
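Beyond dropping the now-unneeded throws IOException, the new constructor passes POSTagFormat.PENN explicitly, presumably so the upgraded OpenNLP tagger keeps emitting Penn Treebank tags rather than the newer library's default tag set. A minimal usage sketch of the changed class (the model file name is hypothetical, not part of the patch):

import java.io.FileInputStream;
import java.io.IOException;
import opennlp.tools.postag.POSModel;
import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;

public class PosTagDemo {
  public static void main(String[] args) throws IOException {
    // Hypothetical model file; any OpenNLP POS model can be loaded this way.
    try (FileInputStream in = new FileInputStream("en-pos-maxent.bin")) {
      POSModel model = new POSModel(in);
      // After this commit the constructor no longer declares IOException and the
      // tagger is pinned to Penn Treebank output (POSTagFormat.PENN).
      NLPPOSTaggerOp tagger = new NLPPOSTaggerOp(model);
      String[] tags = tagger.getPOSTags(new String[] {"Lucene", "indexes", "documents"});
      for (String tag : tags) {
        System.out.println(tag);
      }
    }
  }
}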
@@ -243,13 +243,14 @@ private HnswGraph reconstructAndWriteGraph(
nodesByLevel.add(null);

int maxOrd = graph.size();
int[] scratch = new int[graph.maxConn() * 2];
NodesIterator nodesOnLevel0 = graph.getNodesOnLevel(0);
levelNodeOffsets[0] = new int[nodesOnLevel0.size()];
while (nodesOnLevel0.hasNext()) {
int node = nodesOnLevel0.nextInt();
NeighborArray neighbors = graph.getNeighbors(0, newToOldMap[node]);
long offset = vectorIndex.getFilePointer();
reconstructAndWriteNeighbours(neighbors, oldToNewMap, maxOrd);
reconstructAndWriteNeighbours(neighbors, oldToNewMap, scratch, maxOrd);
levelNodeOffsets[0][node] = Math.toIntExact(vectorIndex.getFilePointer() - offset);
}

@@ -266,7 +267,7 @@
for (int node : newNodes) {
NeighborArray neighbors = graph.getNeighbors(level, newToOldMap[node]);
long offset = vectorIndex.getFilePointer();
reconstructAndWriteNeighbours(neighbors, oldToNewMap, maxOrd);
reconstructAndWriteNeighbours(neighbors, oldToNewMap, scratch, maxOrd);
levelNodeOffsets[level][nodeOffsetIndex++] =
Math.toIntExact(vectorIndex.getFilePointer() - offset);
}
@@ -313,25 +314,33 @@ public NodesIterator getNodesOnLevel(int level) {
};
}

private void reconstructAndWriteNeighbours(NeighborArray neighbors, int[] oldToNewMap, int maxOrd)
throws IOException {
private void reconstructAndWriteNeighbours(
NeighborArray neighbors, int[] oldToNewMap, int[] scratch, int maxOrd) throws IOException {
int size = neighbors.size();
vectorIndex.writeVInt(size);

// Destructively modify; it's ok we are discarding it after this
int[] nnodes = neighbors.nodes();
for (int i = 0; i < size; i++) {
nnodes[i] = oldToNewMap[nnodes[i]];
}
Arrays.sort(nnodes, 0, size);
int actualSize = 0;
if (size > 0) {
scratch[0] = nnodes[0];
actualSize = 1;
}
// Now that we have sorted, do delta encoding to minimize the required bits to store the
// information
for (int i = size - 1; i > 0; --i) {
for (int i = 1; i < size; i++) {
assert nnodes[i] < maxOrd : "node too large: " + nnodes[i] + ">=" + maxOrd;
nnodes[i] -= nnodes[i - 1];
if (nnodes[i - 1] == nnodes[i]) {
continue;
}
scratch[actualSize++] = nnodes[i] - nnodes[i - 1];
}
for (int i = 0; i < size; i++) {
vectorIndex.writeVInt(nnodes[i]);
// Write the size after duplicates are removed
vectorIndex.writeVInt(actualSize);
for (int i = 0; i < actualSize; i++) {
vectorIndex.writeVInt(scratch[i]);
}
}
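For readers skimming the diff: the rewritten reconstructAndWriteNeighbours remaps each neighbor list to new ordinals, sorts it, drops duplicate ordinals, and delta-encodes the survivors into a scratch buffer that is allocated once (new int[graph.maxConn() * 2], presumably sized for level 0, which allows twice as many connections) and reused for every node, then writes the deduplicated count followed by the deltas as VInts. A self-contained sketch of that encoding step, using plain arrays in place of Lucene's NeighborArray and IndexOutput (class and method names here are illustrative, not part of the patch):

import java.util.Arrays;

final class NeighborDeltaEncoding {

  /**
   * Remaps neighbor ordinals, sorts them, drops duplicates, and delta-encodes the
   * survivors into {@code scratch}. Returns how many values were written; the caller
   * would then write that count followed by the scratch values as VInts.
   */
  static int encode(int[] neighbors, int size, int[] oldToNewMap, int[] scratch) {
    // Remap old ordinals to new ordinals; the input array is treated as scratch space.
    for (int i = 0; i < size; i++) {
      neighbors[i] = oldToNewMap[neighbors[i]];
    }
    Arrays.sort(neighbors, 0, size);

    int actualSize = 0;
    if (size > 0) {
      scratch[0] = neighbors[0]; // first neighbor is stored as an absolute ordinal
      actualSize = 1;
    }
    for (int i = 1; i < size; i++) {
      if (neighbors[i] == neighbors[i - 1]) {
        continue; // skip duplicates so each neighbor ordinal is written only once
      }
      scratch[actualSize++] = neighbors[i] - neighbors[i - 1]; // small positive delta
    }
    return actualSize;
  }
}

Writing the count only after duplicates are dropped keeps the stored size in sync with the number of deltas actually emitted.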

216 changes: 30 additions & 186 deletions lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java
@@ -16,8 +16,6 @@
*/
package org.apache.lucene.search;

import java.util.Arrays;
import java.util.Iterator;
import org.apache.lucene.util.PriorityQueue;

/**
@@ -27,205 +25,51 @@
*
* @lucene.internal
*/
public final class DisiPriorityQueue implements Iterable<DisiWrapper> {

static int leftNode(int node) {
return ((node + 1) << 1) - 1;
}

static int rightNode(int leftNode) {
return leftNode + 1;
}

static int parentNode(int node) {
return ((node + 1) >>> 1) - 1;
public abstract sealed class DisiPriorityQueue implements Iterable<DisiWrapper>
permits DisiPriorityQueue2, DisiPriorityQueueN {

/** Create a {@link DisiPriorityQueue} of the given maximum size. */
public static DisiPriorityQueue ofMaxSize(int maxSize) {
if (maxSize <= 2) {
return new DisiPriorityQueue2();
} else {
return new DisiPriorityQueueN(maxSize);
}
}

private final DisiWrapper[] heap;
private int size;
/** Return the number of entries in this heap. */
public abstract int size();

public DisiPriorityQueue(int maxSize) {
heap = new DisiWrapper[maxSize];
size = 0;
}

public int size() {
return size;
}

public DisiWrapper top() {
return heap[0];
}
/** Return top value in this heap, or null if the heap is empty. */
public abstract DisiWrapper top();

/** Return the 2nd least value in this heap, or null if the heap contains less than 2 values. */
public DisiWrapper top2() {
switch (size()) {
case 0:
case 1:
return null;
case 2:
return heap[1];
default:
if (heap[1].doc <= heap[2].doc) {
return heap[1];
} else {
return heap[2];
}
}
}
public abstract DisiWrapper top2();

/** Get the list of scorers which are on the current doc. */
public DisiWrapper topList() {
final DisiWrapper[] heap = this.heap;
final int size = this.size;
DisiWrapper list = heap[0];
list.next = null;
if (size >= 3) {
list = topList(list, heap, size, 1);
list = topList(list, heap, size, 2);
} else if (size == 2 && heap[1].doc == list.doc) {
list = prepend(heap[1], list);
}
return list;
}

// prepend w1 (iterator) to w2 (list)
private DisiWrapper prepend(DisiWrapper w1, DisiWrapper w2) {
w1.next = w2;
return w1;
}

private DisiWrapper topList(DisiWrapper list, DisiWrapper[] heap, int size, int i) {
final DisiWrapper w = heap[i];
if (w.doc == list.doc) {
list = prepend(w, list);
final int left = leftNode(i);
final int right = left + 1;
if (right < size) {
list = topList(list, heap, size, left);
list = topList(list, heap, size, right);
} else if (left < size && heap[left].doc == list.doc) {
list = prepend(heap[left], list);
}
}
return list;
}
public abstract DisiWrapper topList();

public DisiWrapper add(DisiWrapper entry) {
final DisiWrapper[] heap = this.heap;
final int size = this.size;
heap[size] = entry;
upHeap(size);
this.size = size + 1;
return heap[0];
}
/** Add a {@link DisiWrapper} to this queue and return the top entry. */
public abstract DisiWrapper add(DisiWrapper entry);

/** Bulk add. */
public void addAll(DisiWrapper[] entries, int offset, int len) {
// Nothing to do if empty:
if (len == 0) {
return;
}

// Fail early if we're going to over-fill:
if (size + len > heap.length) {
throw new IndexOutOfBoundsException(
"Cannot add "
+ len
+ " elements to a queue with remaining capacity "
+ (heap.length - size));
}

// Copy the entries over to our heap array:
System.arraycopy(entries, offset, heap, size, len);
size += len;

// Heapify in bulk:
final int firstLeafIndex = size >>> 1;
for (int rootIndex = firstLeafIndex - 1; rootIndex >= 0; rootIndex--) {
int parentIndex = rootIndex;
DisiWrapper parent = heap[parentIndex];
while (parentIndex < firstLeafIndex) {
int childIndex = leftNode(parentIndex);
int rightChildIndex = rightNode(childIndex);
DisiWrapper child = heap[childIndex];
if (rightChildIndex < size && heap[rightChildIndex].doc < child.doc) {
child = heap[rightChildIndex];
childIndex = rightChildIndex;
}
if (child.doc >= parent.doc) {
break;
}
heap[parentIndex] = child;
parentIndex = childIndex;
}
heap[parentIndex] = parent;
for (int i = 0; i < len; ++i) {
add(entries[offset + i]);
}
}

public DisiWrapper pop() {
final DisiWrapper[] heap = this.heap;
final DisiWrapper result = heap[0];
final int i = --size;
heap[0] = heap[i];
heap[i] = null;
downHeap(i);
return result;
}
/** Remove the top entry and return it. */
public abstract DisiWrapper pop();

public DisiWrapper updateTop() {
downHeap(size);
return heap[0];
}
/** Rebalance this heap and return the top entry. */
public abstract DisiWrapper updateTop();

DisiWrapper updateTop(DisiWrapper topReplacement) {
heap[0] = topReplacement;
return updateTop();
}
/**
* Replace the top entry with the given entry, rebalance the heap, and return the new top entry.
*/
abstract DisiWrapper updateTop(DisiWrapper topReplacement);

/** Clear the heap. */
public void clear() {
Arrays.fill(heap, null);
size = 0;
}

void upHeap(int i) {
final DisiWrapper node = heap[i];
final int nodeDoc = node.doc;
int j = parentNode(i);
while (j >= 0 && nodeDoc < heap[j].doc) {
heap[i] = heap[j];
i = j;
j = parentNode(j);
}
heap[i] = node;
}

void downHeap(int size) {
int i = 0;
final DisiWrapper node = heap[0];
int j = leftNode(i);
if (j < size) {
int k = rightNode(j);
if (k < size && heap[k].doc < heap[j].doc) {
j = k;
}
if (heap[j].doc < node.doc) {
do {
heap[i] = heap[j];
i = j;
j = leftNode(i);
k = rightNode(j);
if (k < size && heap[k].doc < heap[j].doc) {
j = k;
}
} while (j < size && heap[j].doc < node.doc);
heap[i] = node;
}
}
}

@Override
public Iterator<DisiWrapper> iterator() {
return Arrays.asList(heap).subList(0, size).iterator();
}
public abstract void clear();
}
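Since DisiPriorityQueue is now sealed and abstract, code that previously did new DisiPriorityQueue(maxSize) goes through the ofMaxSize factory, which returns the two-entry specialization for maxSize <= 2 and the general heap otherwise. A minimal usage sketch against the new API (the surrounding class and methods are illustrative only, not part of the patch):

import org.apache.lucene.search.DisiPriorityQueue;
import org.apache.lucene.search.DisiWrapper;

final class DisiQueueUsage {

  /** Builds a queue sized for the given wrappers and bulk-loads it. */
  static DisiPriorityQueue buildQueue(DisiWrapper[] wrappers) {
    // The factory picks DisiPriorityQueue2 for maxSize <= 2, DisiPriorityQueueN otherwise.
    DisiPriorityQueue pq = DisiPriorityQueue.ofMaxSize(wrappers.length);
    // addAll is now a simple loop over add(), as shown in the diff above.
    pq.addAll(wrappers, 0, wrappers.length);
    return pq;
  }

  /** Walks the linked list of scorers positioned on the current top doc. */
  static int countScorersOnTopDoc(DisiPriorityQueue pq) {
    int count = 0;
    for (DisiWrapper w = pq.topList(); w != null; w = w.next) {
      count++;
    }
    return count;
  }
}

Splitting out a dedicated two-entry implementation presumably avoids the full heap machinery for the very common two-clause disjunction.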