Skip to content

Commit

Permalink
apache#12276: rename DaciukMihovAutomatonBuilder to StringsToAutomaton
Browse files Browse the repository at this point in the history
  • Loading branch information
mikemccand committed May 18, 2023
1 parent 0c6e8ae commit ad3f89a
Show file tree
Hide file tree
Showing 10 changed files with 30 additions and 23 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ API Changes
They have already been runtime-checked to only be implemented by the specific classes
so this is effectively a non-breaking change.

* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton (Mike McCandless)

New Features
---------------------

Expand Down
4 changes: 4 additions & 0 deletions lucene/MIGRATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ for the currently-positioned document (doing so will result in undefined behavio
`IOContext.READONCE` for opening internally, as that's the only valid usage pattern for checksum input.
Callers should remove the parameter when calling this method.


### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton

The `org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder` class has been renamed
to `StringsToAutomaton`. Update references to use the new class name; the static
`build(Collection)` method and the `MAX_TERM_LENGTH` constant are otherwise unchanged.

## Migration from Lucene 9.0 to Lucene 9.1

### Test framework package migration and module (LUCENE-10301)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.StringsToAutomaton;
import org.apache.lucene.util.automaton.Transition;

public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
Expand Down Expand Up @@ -780,7 +780,7 @@ public void testPathsNotLost() throws IOException {
acceptStrings.sort(Comparator.naturalOrder());

acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList());
Automaton nonFlattenedAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton nonFlattenedAutomaton = StringsToAutomaton.build(acceptStrings);

TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton);
TokenStream flattenedTokenStream = new FlattenGraphFilter(ts);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
return makeEmpty();
} else {
return DaciukMihovAutomatonBuilder.build(utf8Strings);
return StringsToAutomaton.build(utf8Strings);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@
import org.apache.lucene.util.UnicodeUtil;

/**
* Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm
* requires sorted input data, but is very fast (nearly linear with the input size).
* Builds a minimal, deterministic {@link Automaton} that accepts a set of strings using the
* algorithm described in <a href="https://aclanthology.org/J00-1002.pdf">Incremental Construction
* of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson</a>. This requires
* sorted input data, but is very fast (nearly linear with the input size).
*
* @see #build(Collection)
* @see Automata#makeStringUnion(Collection)
*/
public final class DaciukMihovAutomatonBuilder {
public final class StringsToAutomaton {

/**
* This builder rejects terms that are more than 1k chars long since it then uses recursion based
Expand All @@ -42,7 +44,7 @@ public final class DaciukMihovAutomatonBuilder {
public static final int MAX_TERM_LENGTH = 1_000;

/** The default constructor is private. Use static methods directly. */
private DaciukMihovAutomatonBuilder() {
private StringsToAutomaton() {
super();
}

Expand Down Expand Up @@ -246,7 +248,7 @@ private static int convert(
visited.put(s, converted);
int i = 0;
int[] labels = s.labels;
for (DaciukMihovAutomatonBuilder.State target : s.states) {
for (StringsToAutomaton.State target : s.states) {
a.addTransition(converted, convert(a, target, visited), labels[i++]);
}

Expand All @@ -258,7 +260,7 @@ private static int convert(
* strings in UTF-8. These strings must be binary-sorted.
*/
public static Automaton build(Collection<BytesRef> input) {
final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
final StringsToAutomaton builder = new StringsToAutomaton();

char[] chars = new char[0];
CharsRef ref = new CharsRef();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.StringsToAutomaton;

public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {

public void testSinglePath() throws IOException {
List<BytesRef> acceptStrings = new ArrayList<>();
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand All @@ -48,7 +48,7 @@ public void testParallelPaths() throws IOException {
acceptStrings.add(new BytesRef("123"));
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand All @@ -65,7 +65,7 @@ public void testForkedPath() throws IOException {
acceptStrings.add(new BytesRef("ab3"));
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ private CompiledAutomaton build(int determinizeWorkLimit, String... strings) {
terms.add(new BytesRef(s));
}
Collections.sort(terms);
final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
final Automaton a = StringsToAutomaton.build(terms);
return new CompiledAutomaton(a, true, false, false);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,20 @@
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;

public class TestDaciukMihovAutomatonBuilder extends LuceneTestCase {
public class TestStringsToAutomaton extends LuceneTestCase {

public void testLargeTerms() {
byte[] b10k = new byte[10_000];
Arrays.fill(b10k, (byte) 'a');
IllegalArgumentException e =
expectThrows(
IllegalArgumentException.class,
() -> DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b10k))));
() -> StringsToAutomaton.build(Collections.singleton(new BytesRef(b10k))));
assertTrue(
e.getMessage()
.startsWith("This builder doesn't allow terms that are larger than 1,000 characters"));

byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);
DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b1k))); // no exception
StringsToAutomaton.build(Collections.singleton(new BytesRef(b1k))); // no exception
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.StringsToAutomaton;

/**
* Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
Expand Down Expand Up @@ -66,7 +66,7 @@ private static CharArrayMatcher buildCombinedAutomaton(UHComponents components)
// to build an automaton on them
List<BytesRef> filteredTerms =
Arrays.stream(components.getTerms())
.filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
.filter(b -> b.length < StringsToAutomaton.MAX_TERM_LENGTH)
.toList();
allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.StringsToAutomaton;
import org.junit.After;
import org.junit.Before;

Expand Down Expand Up @@ -1671,12 +1671,11 @@ public void testQueryWithLongTerm() throws IOException {
Query query =
new BooleanQuery.Builder()
.add(
new TermQuery(
new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))),
new TermQuery(new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH))),
BooleanClause.Occur.SHOULD)
.add(
new TermQuery(
new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))),
new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH + 1))),
BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
.build();
Expand Down

0 comments on commit ad3f89a

Please sign in to comment.