GH#12321: Reduce visibility of StringsToAutomaton (#12331)

apache · May 26, 2023 · 367b03b · 367b03b
1 parent f5f2577
commit 367b03b
Show file tree

Hide file tree

Showing 9 changed files with 26 additions and 23 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -53,6 +53,8 @@ API Changes
 
 * GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton
 
+* GITHUB#12321: Reduced visibility of StringsToAutomaton. Please use Automata#makeStringUnion instead. (Greg Miller)
+
 New Features
 ---------------------
 

diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
@@ -90,7 +90,10 @@ for the currently-positioned document (doing so will result in undefined behavio
 Callers should remove the parameter when calling this method.
 
 
-### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton
+### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton and made package-private
+
+The former `DaciukMihovAutomatonBuilder#build` functionality is exposed through `Automata#makeStringUnion`.
+Users should be able to directly migrate to the `Automata` static method as a 1:1 replacement.
 
 
 ## Migration from Lucene 9.0 to Lucene 9.1

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
@@ -40,9 +40,9 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.Operations;
-import org.apache.lucene.util.automaton.StringsToAutomaton;
 import org.apache.lucene.util.automaton.Transition;
 
 public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
@@ -780,7 +780,7 @@ public void testPathsNotLost() throws IOException {
     acceptStrings.sort(Comparator.naturalOrder());
 
     acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList());
-    Automaton nonFlattenedAutomaton = StringsToAutomaton.build(acceptStrings);
+    Automaton nonFlattenedAutomaton = Automata.makeStringUnion(acceptStrings);
 
     TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton);
     TokenStream flattenedTokenStream = new FlattenGraphFilter(ts);

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -40,6 +40,11 @@
  * @lucene.experimental
  */
 public final class Automata {
+  /**
+   * {@link #makeStringUnion(Collection)} limits terms of this max length to ensure the stack
+   * doesn't overflow while building, since our algorithm currently relies on recursion.
+   */
+  public static final int MAX_STRING_UNION_TERM_LENGTH = 1000;
 
   private Automata() {}
 

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -32,16 +32,9 @@
  * of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson</a>. This requires
  * sorted input data, but is very fast (nearly linear with the input size).
  *
- * @see #build(Collection)
  * @see Automata#makeStringUnion(Collection)
  */
-public final class StringsToAutomaton {
-
-  /**
-   * This builder rejects terms that are more than 1k chars long since it then uses recursion based
-   * on the length of the string, which might cause stack overflows.
-   */
-  public static final int MAX_TERM_LENGTH = 1_000;
+final class StringsToAutomaton {
 
   /** The default constructor is private. Use static methods directly. */
   private StringsToAutomaton() {
@@ -195,7 +188,7 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) {
    * or equal compared to any previous sequences added to this automaton (the input must be sorted).
    */
   private void add(CharsRef current) {
-    if (current.length > MAX_TERM_LENGTH) {
+    if (current.length > Automata.MAX_STRING_UNION_TERM_LENGTH) {
       throw new IllegalArgumentException(
           "This builder doesn't allow terms that are larger than 1,000 characters, got " + current);
     }
@@ -269,7 +262,7 @@ private static int convert(
    * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
    * strings in UTF-8. These strings must be binary-sorted.
    */
-  public static Automaton build(Collection<BytesRef> input) {
+  static Automaton build(Collection<BytesRef> input) {
     final StringsToAutomaton builder = new StringsToAutomaton();
 
     CharsRefBuilder current = new CharsRefBuilder();

diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java
@@ -22,16 +22,16 @@
 import java.util.List;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.StringsToAutomaton;
 
 public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {
 
   public void testSinglePath() throws IOException {
     List<BytesRef> acceptStrings = new ArrayList<>();
     acceptStrings.add(new BytesRef("abc"));
 
-    Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
+    Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
     TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
     assertTokenStreamContents(
         ts,
@@ -48,7 +48,7 @@ public void testParallelPaths() throws IOException {
     acceptStrings.add(new BytesRef("123"));
     acceptStrings.add(new BytesRef("abc"));
 
-    Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
+    Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
     TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
     assertTokenStreamContents(
         ts,
@@ -65,7 +65,7 @@ public void testForkedPath() throws IOException {
     acceptStrings.add(new BytesRef("ab3"));
     acceptStrings.add(new BytesRef("abc"));
 
-    Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
+    Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
     TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
     assertTokenStreamContents(
         ts,

diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
@@ -35,7 +35,7 @@ private CompiledAutomaton build(int determinizeWorkLimit, String... strings) {
       terms.add(new BytesRef(s));
     }
     Collections.sort(terms);
-    final Automaton a = StringsToAutomaton.build(terms);
+    final Automaton a = Automata.makeStringUnion(terms);
     return new CompiledAutomaton(a, true, false, false);
   }
 

diff --git a/...e/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/...e/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
@@ -29,7 +29,7 @@
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.queries.spans.SpanQuery;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.automaton.StringsToAutomaton;
+import org.apache.lucene.util.automaton.Automata;
 
 /**
  * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
@@ -66,7 +66,7 @@ private static CharArrayMatcher buildCombinedAutomaton(UHComponents components)
       // to build an automaton on them
       List<BytesRef> filteredTerms =
           Arrays.stream(components.getTerms())
-              .filter(b -> b.length < StringsToAutomaton.MAX_TERM_LENGTH)
+              .filter(b -> b.length < Automata.MAX_STRING_UNION_TERM_LENGTH)
               .toList();
       allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
     }

diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
@@ -58,7 +58,7 @@
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.tests.util.LuceneTestCase;
-import org.apache.lucene.util.automaton.StringsToAutomaton;
+import org.apache.lucene.util.automaton.Automata;
 import org.junit.After;
 import org.junit.Before;
 
@@ -1671,11 +1671,11 @@ public void testQueryWithLongTerm() throws IOException {
     Query query =
         new BooleanQuery.Builder()
             .add(
-                new TermQuery(new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH))),
+                new TermQuery(new Term("title", "a".repeat(Automata.MAX_STRING_UNION_TERM_LENGTH))),
                 BooleanClause.Occur.SHOULD)
             .add(
                 new TermQuery(
-                    new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH + 1))),
+                    new Term("title", "a".repeat(Automata.MAX_STRING_UNION_TERM_LENGTH + 1))),
                 BooleanClause.Occur.SHOULD)
             .add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
             .build();
-Original file line number
+Diff line change
@@ Expand Up / @@ -53,6 +53,8 @@ API Changes @@
     * GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton
+    * GITHUB#12321: Reduced visibility of StringsToAutomaton. Please use Automata#makeStringUnion instead. (Greg Miller)
     New Features
     ---------------------
@@ Expand Down @@