Add "direct to binary" option for DaciukMihovAutomatonBuilder and use it in TermInSetQuery#visit (#12320)
apache · Jun 2, 2023 · 52ace7e · 52ace7e
1 parent 45110a6
commit 52ace7e
Show file tree

Hide file tree

Showing 7 changed files with 395 additions and 133 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -158,6 +158,9 @@ Improvements
 
 * GITHUB#12290: Make memory fence in ByteBufferGuard explicit using `VarHandle.fullFence()`
 
+* GITHUB#12320: Add "direct to binary" option for DaciukMihovAutomatonBuilder and use it in TermInSetQuery#visit.
+  (Greg Miller)
+
 Optimizations
 ---------------------
 

diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -17,11 +17,10 @@
 package org.apache.lucene.search;
 
 import java.io.IOException;
-import java.util.ArrayList;
+import java.io.UncheckedIOException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.List;
 import java.util.SortedSet;
 import org.apache.lucene.index.FilteredTermsEnum;
 import org.apache.lucene.index.PrefixCodedTerms;
@@ -38,8 +37,6 @@
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.ByteRunAutomaton;
-import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.Operations;
 
 /**
  * Specialization for a disjunction over many terms that, by default, behaves like a {@link
@@ -150,17 +147,17 @@ public void visit(QueryVisitor visitor) {
     }
   }
 
-  // TODO: this is extremely slow. we should not be doing this.
+  // TODO: This is pretty heavy-weight. If we have TermInSetQuery directly extend AutomatonQuery
+  // we won't have to do this (see GH#12176).
   private ByteRunAutomaton asByteRunAutomaton() {
-    TermIterator iterator = termData.iterator();
-    List<Automaton> automata = new ArrayList<>();
-    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
-      automata.add(Automata.makeBinary(term));
+    try {
+      Automaton a = Automata.makeBinaryStringUnion(termData.iterator());
+      return new ByteRunAutomaton(a, true);
+    } catch (IOException e) {
+      // Shouldn't happen since termData.iterator() provides an iterator implementation that
+      // never throws:
+      throw new UncheckedIOException(e);
     }
-    Automaton automaton =
-        Operations.determinize(
-            Operations.union(automata), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
-    return new CompiledAutomaton(automaton).runAutomaton;
   }
 
   @Override

diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -477,38 +477,59 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
     int utf8Upto = utf8.offset;
     final byte[] bytes = utf8.bytes;
     final int utf8Limit = utf8.offset + utf8.length;
+    UTF8CodePoint reuse = null;
     while (utf8Upto < utf8Limit) {
-      final int numBytes = utf8CodeLength[bytes[utf8Upto] & 0xFF];
-      int v = 0;
-      switch (numBytes) {
-        case 1:
-          ints[utf32Count++] = bytes[utf8Upto++];
-          continue;
-        case 2:
-          // 5 useful bits
-          v = bytes[utf8Upto++] & 31;
-          break;
-        case 3:
-          // 4 useful bits
-          v = bytes[utf8Upto++] & 15;
-          break;
-        case 4:
-          // 3 useful bits
-          v = bytes[utf8Upto++] & 7;
-          break;
-        default:
-          throw new IllegalArgumentException("invalid utf8");
-      }
+      reuse = codePointAt(bytes, utf8Upto, reuse);
+      ints[utf32Count++] = reuse.codePoint;
+      utf8Upto += reuse.numBytes;
+    }
+
+    return utf32Count;
+  }
 
-      // TODO: this may read past utf8's limit.
-      final int limit = utf8Upto + numBytes - 1;
-      while (utf8Upto < limit) {
-        v = v << 6 | bytes[utf8Upto++] & 63;
+  /**
+   * Computes the codepoint and codepoint length (in bytes) at the specified {@code pos} in the
+   * provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this
+   * class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
+   * validation. Passing invalid UTF8 or a position that is not a valid header byte position may
+   * result in undefined behavior. This makes no attempt to synchronize or validate.
+   */
+  public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) {
+    if (reuse == null) {
+      reuse = new UTF8CodePoint();
+    }
+
+    int leadByte = utf8[pos] & 0xFF;
+    int numBytes = utf8CodeLength[leadByte];
+    reuse.numBytes = numBytes;
+    int v;
+    switch (numBytes) {
+      case 1 -> {
+        reuse.codePoint = leadByte;
+        return reuse;
       }
-      ints[utf32Count++] = v;
+      case 2 -> v = leadByte & 31; // 5 useful bits
+      case 3 -> v = leadByte & 15; // 4 useful bits
+      case 4 -> v = leadByte & 7; // 3 useful bits
+      default -> throw new IllegalArgumentException(
+          "Invalid UTF8 header byte: 0x" + Integer.toHexString(leadByte));
     }
 
-    return utf32Count;
+    // TODO: this may read past utf8's limit.
+    final int limit = pos + numBytes;
+    pos++;
+    while (pos < limit) {
+      v = v << 6 | utf8[pos++] & 63;
+    }
+    reuse.codePoint = v;
+
+    return reuse;
+  }
+
+  /** Holds a codepoint along with the number of bytes required to represent it in UTF8 */
+  public static final class UTF8CodePoint {
+    public int codePoint;
+    public int numBytes;
   }
 
   /** Shift value for lead surrogate to form a supplementary character. */

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -29,9 +29,11 @@
 
 package org.apache.lucene.util.automaton;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.StringHelper;
 
 /**
@@ -578,7 +580,49 @@ public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
     if (utf8Strings.isEmpty()) {
       return makeEmpty();
     } else {
-      return StringsToAutomaton.build(utf8Strings);
+      return StringsToAutomaton.build(utf8Strings, false);
     }
   }
+
+  /**
+   * Returns a new (deterministic and minimal) automaton that accepts the union of the given
+   * collection of {@link BytesRef}s representing UTF-8 encoded strings. The resulting automaton
+   * will be built in a binary representation.
+   *
+   * @param utf8Strings The input strings, UTF-8 encoded. The collection must be in sorted order.
+   * @return An {@link Automaton} accepting all input strings. The resulting automaton is binary
+   *     based (UTF-8 encoded byte transition labels).
+   */
+  public static Automaton makeBinaryStringUnion(Collection<BytesRef> utf8Strings) {
+    if (utf8Strings.isEmpty()) {
+      return makeEmpty();
+    } else {
+      return StringsToAutomaton.build(utf8Strings, true);
+    }
+  }
+
+  /**
+   * Returns a new (deterministic and minimal) automaton that accepts the union of the given
+   * iterator of {@link BytesRef}s representing UTF-8 encoded strings.
+   *
+   * @param utf8Strings The input strings, UTF-8 encoded. The iterator must be in sorted order.
+   * @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint
+   *     based (full unicode codepoints on transitions).
+   */
+  public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException {
+    return StringsToAutomaton.build(utf8Strings, false);
+  }
+
+  /**
+   * Returns a new (deterministic and minimal) automaton that accepts the union of the given
+   * iterator of {@link BytesRef}s representing UTF-8 encoded strings. The resulting automaton will
+   * be built in a binary representation.
+   *
+   * @param utf8Strings The input strings, UTF-8 encoded. The iterator must be in sorted order.
+   * @return An {@link Automaton} accepting all input strings. The resulting automaton is binary
+   *     based (UTF-8 encoded byte transition labels).
+   */
+  public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException {
+    return StringsToAutomaton.build(utf8Strings, true);
+  }
 }