Skip to content

Commit

Permalink
Add "direct to binary" option for DaciukMihovAutomatonBuilder and use…
Browse files Browse the repository at this point in the history
… it in TermInSetQuery#visit (#12320)
  • Loading branch information
gsmiller authored Jun 2, 2023
1 parent 45110a6 commit 52ace7e
Show file tree
Hide file tree
Showing 7 changed files with 395 additions and 133 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ Improvements

* GITHUB#12290: Make memory fence in ByteBufferGuard explicit using `VarHandle.fullFence()`

* GITHUB#12320: Add "direct to binary" option for DaciukMihovAutomatonBuilder and use it in TermInSetQuery#visit.
(Greg Miller)

Optimizations
---------------------

Expand Down
23 changes: 10 additions & 13 deletions lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,10 @@
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.SortedSet;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.PrefixCodedTerms;
Expand All @@ -38,8 +37,6 @@
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Operations;

/**
* Specialization for a disjunction over many terms that, by default, behaves like a {@link
Expand Down Expand Up @@ -150,17 +147,17 @@ public void visit(QueryVisitor visitor) {
}
}

// TODO: this is extremely slow. we should not be doing this.
// TODO: This is pretty heavy-weight. If we have TermInSetQuery directly extend AutomatonQuery
// we won't have to do this (see GH#12176).
private ByteRunAutomaton asByteRunAutomaton() {
TermIterator iterator = termData.iterator();
List<Automaton> automata = new ArrayList<>();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
automata.add(Automata.makeBinary(term));
try {
Automaton a = Automata.makeBinaryStringUnion(termData.iterator());
return new ByteRunAutomaton(a, true);
} catch (IOException e) {
// Shouldn't happen since termData.iterator() provides an iterator implementation that
// never throws:
throw new UncheckedIOException(e);
}
Automaton automaton =
Operations.determinize(
Operations.union(automata), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
return new CompiledAutomaton(automaton).runAutomaton;
}

@Override
Expand Down
75 changes: 48 additions & 27 deletions lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -477,38 +477,59 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
int utf8Upto = utf8.offset;
final byte[] bytes = utf8.bytes;
final int utf8Limit = utf8.offset + utf8.length;
UTF8CodePoint reuse = null;
while (utf8Upto < utf8Limit) {
final int numBytes = utf8CodeLength[bytes[utf8Upto] & 0xFF];
int v = 0;
switch (numBytes) {
case 1:
ints[utf32Count++] = bytes[utf8Upto++];
continue;
case 2:
// 5 useful bits
v = bytes[utf8Upto++] & 31;
break;
case 3:
// 4 useful bits
v = bytes[utf8Upto++] & 15;
break;
case 4:
// 3 useful bits
v = bytes[utf8Upto++] & 7;
break;
default:
throw new IllegalArgumentException("invalid utf8");
}
reuse = codePointAt(bytes, utf8Upto, reuse);
ints[utf32Count++] = reuse.codePoint;
utf8Upto += reuse.numBytes;
}

return utf32Count;
}

// TODO: this may read past utf8's limit.
final int limit = utf8Upto + numBytes - 1;
while (utf8Upto < limit) {
v = v << 6 | bytes[utf8Upto++] & 63;
/**
* Computes the codepoint and codepoint length (in bytes) at the specified {@code pos} in the
* provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this
* class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
* validation. Passing invalid UTF8 or a position that is not a valid header byte position may
* result in undefined behavior. This makes no attempt to synchronize or validate.
*/
public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) {
if (reuse == null) {
reuse = new UTF8CodePoint();
}

int leadByte = utf8[pos] & 0xFF;
int numBytes = utf8CodeLength[leadByte];
reuse.numBytes = numBytes;
int v;
switch (numBytes) {
case 1 -> {
reuse.codePoint = leadByte;
return reuse;
}
ints[utf32Count++] = v;
case 2 -> v = leadByte & 31; // 5 useful bits
case 3 -> v = leadByte & 15; // 4 useful bits
case 4 -> v = leadByte & 7; // 3 useful bits
default -> throw new IllegalArgumentException(
"Invalid UTF8 header byte: 0x" + Integer.toHexString(leadByte));
}

return utf32Count;
// TODO: this may read past utf8's limit.
final int limit = pos + numBytes;
pos++;
while (pos < limit) {
v = v << 6 | utf8[pos++] & 63;
}
reuse.codePoint = v;

return reuse;
}

/** Holds a codepoint along with the number of bytes required to represent it in UTF8 */
public static final class UTF8CodePoint {
/** The decoded code point value; written by {@code codePointAt} on every call. */
public int codePoint;
/**
* Number of UTF8 bytes the code point occupies, taken from the lead byte's code length; callers
* such as {@code UTF8toUTF32} advance their read position by this amount.
*/
public int numBytes;
}

/** Shift value for lead surrogate to form a supplementary character. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@

package org.apache.lucene.util.automaton;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.StringHelper;

/**
Expand Down Expand Up @@ -578,7 +580,49 @@ public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
return makeEmpty();
} else {
return StringsToAutomaton.build(utf8Strings);
return StringsToAutomaton.build(utf8Strings, false);
}
}

/**
 * Builds a new (deterministic and minimal) automaton whose language is exactly the union of the
 * supplied {@link BytesRef}s, each of which holds a UTF-8 encoded string. The automaton is
 * produced in a binary representation.
 *
 * @param utf8Strings The input strings, UTF-8 encoded. The collection must be in sorted order.
 * @return An {@link Automaton} accepting all input strings. The resulting automaton is binary
 *     based (UTF-8 encoded byte transition labels).
 */
public static Automaton makeBinaryStringUnion(Collection<BytesRef> utf8Strings) {
  // Nothing to union: return the empty automaton without invoking the builder.
  if (utf8Strings.isEmpty()) {
    return makeEmpty();
  }
  // The 'true' flag selects the binary (byte-label) form of the builder.
  return StringsToAutomaton.build(utf8Strings, true);
}

/**
* Returns a new (deterministic and minimal) automaton that accepts the union of the given
* iterator of {@link BytesRef}s representing UTF-8 encoded strings.
*
* <p>This overload is a direct delegate to {@code StringsToAutomaton.build(utf8Strings, false)}.
* Unlike the {@code Collection}-based overloads, it performs no explicit empty-input check before
* delegating.
*
* @param utf8Strings The input strings, UTF-8 encoded. The iterator must be in sorted order.
* @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint
* based (full unicode codepoints on transitions).
* @throws IOException propagated from {@code StringsToAutomaton.build}; presumably raised while
* advancing the supplied iterator (the builder is not visible here, so confirm against it).
*/
public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException {
// 'false' requests the codepoint-based (non-binary) form, matching makeStringUnion(Collection).
return StringsToAutomaton.build(utf8Strings, false);
}

/**
* Returns a new (deterministic and minimal) automaton that accepts the union of the given
* iterator of {@link BytesRef}s representing UTF-8 encoded strings. The resulting automaton will
* be built in a binary representation.
*
* <p>This overload is a direct delegate to {@code StringsToAutomaton.build(utf8Strings, true)}.
* Unlike the {@code Collection}-based overloads, it performs no explicit empty-input check before
* delegating.
*
* @param utf8Strings The input strings, UTF-8 encoded. The iterator must be in sorted order.
* @return An {@link Automaton} accepting all input strings. The resulting automaton is binary
* based (UTF-8 encoded byte transition labels).
* @throws IOException propagated from {@code StringsToAutomaton.build}; presumably raised while
* advancing the supplied iterator (the builder is not visible here, so confirm against it).
*/
public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException {
// 'true' requests the binary (UTF-8 byte label) form of the automaton.
return StringsToAutomaton.build(utf8Strings, true);
}
}
Loading

0 comments on commit 52ace7e

Please sign in to comment.