diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 57c255853db9..5dd286488ecc 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -51,6 +51,8 @@ API Changes They have already been runtime-checked to only be implemented by the specific classes so this is effectively a non-breaking change. +* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton + New Features --------------------- diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index e731cfd84ab8..2bb6c27f563a 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -89,6 +89,10 @@ for the currently-positioned document (doing so will result in undefined behavio `IOContext.READONCE` for opening internally, as that's the only valid usage pattern for checksum input. Callers should remove the parameter when calling this method. + +### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton + + ## Migration from Lucene 9.0 to Lucene 9.1 ### Test framework package migration and module (LUCENE-10301) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java index 7b35f56016cb..13d3dd378e79 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java @@ -41,8 +41,8 @@ import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.StringsToAutomaton; import org.apache.lucene.util.automaton.Transition; public class TestFlattenGraphFilter extends BaseTokenStreamTestCase { @@ -780,7 +780,7 @@ public void testPathsNotLost() throws IOException { acceptStrings.sort(Comparator.naturalOrder()); acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList()); - Automaton nonFlattenedAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + Automaton nonFlattenedAutomaton = StringsToAutomaton.build(acceptStrings); TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton); TokenStream flattenedTokenStream = new FlattenGraphFilter(ts); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java index 9a642338f09b..26afdc455649 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java @@ -573,7 +573,7 @@ public static Automaton makeStringUnion(Collection utf8Strings) { if (utf8Strings.isEmpty()) { return makeEmpty(); } else { - return DaciukMihovAutomatonBuilder.build(utf8Strings); + return StringsToAutomaton.build(utf8Strings); } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java similarity index 95% rename from lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java rename to lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index 94002b04a40b..c40656d40894 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -27,13 +27,15 @@ import org.apache.lucene.util.UnicodeUtil; /** - * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm - * requires sorted input data, but is very fast (nearly linear with the input size). + * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings using the + * algorithm described in Incremental Construction + * of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson. This requires + * sorted input data, but is very fast (nearly linear with the input size). * * @see #build(Collection) * @see Automata#makeStringUnion(Collection) */ -public final class DaciukMihovAutomatonBuilder { +public final class StringsToAutomaton { /** * This builder rejects terms that are more than 1k chars long since it then uses recursion based @@ -42,7 +44,7 @@ public final class DaciukMihovAutomatonBuilder { public static final int MAX_TERM_LENGTH = 1_000; /** The default constructor is private. Use static methods directly. */ - private DaciukMihovAutomatonBuilder() { + private StringsToAutomaton() { super(); } @@ -246,7 +248,7 @@ private static int convert( visited.put(s, converted); int i = 0; int[] labels = s.labels; - for (DaciukMihovAutomatonBuilder.State target : s.states) { + for (StringsToAutomaton.State target : s.states) { a.addTransition(converted, convert(a, target, visited), labels[i++]); } @@ -258,7 +260,7 @@ private static int convert( * strings in UTF-8. These strings must be binary-sorted. */ public static Automaton build(Collection input) { - final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder(); + final StringsToAutomaton builder = new StringsToAutomaton(); char[] chars = new char[0]; CharsRef ref = new CharsRef(); diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java index 27d2c72ddb09..17157c48bdc3 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java @@ -23,7 +23,7 @@ import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; +import org.apache.lucene.util.automaton.StringsToAutomaton; public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase { @@ -31,7 +31,7 @@ public void testSinglePath() throws IOException { List acceptStrings = new ArrayList<>(); acceptStrings.add(new BytesRef("abc")); - Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings); TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); assertTokenStreamContents( ts, @@ -48,7 +48,7 @@ public void testParallelPaths() throws IOException { acceptStrings.add(new BytesRef("123")); acceptStrings.add(new BytesRef("abc")); - Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings); TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); assertTokenStreamContents( ts, @@ -65,7 +65,7 @@ public void testForkedPath() throws IOException { acceptStrings.add(new BytesRef("ab3")); acceptStrings.add(new BytesRef("abc")); - Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings); + Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings); TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); assertTokenStreamContents( ts, diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java index 432c281b1840..cb69e744b54a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java @@ -35,7 +35,7 @@ private CompiledAutomaton build(int determinizeWorkLimit, String... strings) { terms.add(new BytesRef(s)); } Collections.sort(terms); - final Automaton a = DaciukMihovAutomatonBuilder.build(terms); + final Automaton a = StringsToAutomaton.build(terms); return new CompiledAutomaton(a, true, false, false); } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java similarity index 84% rename from lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java rename to lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index 4b31eda98c6d..a251ae748809 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -22,7 +22,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; -public class TestDaciukMihovAutomatonBuilder extends LuceneTestCase { +public class TestStringsToAutomaton extends LuceneTestCase { public void testLargeTerms() { byte[] b10k = new byte[10_000]; @@ -30,12 +30,12 @@ public void testLargeTerms() { IllegalArgumentException e = expectThrows( IllegalArgumentException.class, - () -> DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b10k)))); + () -> StringsToAutomaton.build(Collections.singleton(new BytesRef(b10k)))); assertTrue( e.getMessage() .startsWith("This builder doesn't allow terms that are larger than 1,000 characters")); byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000); - DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b1k))); // no exception + StringsToAutomaton.build(Collections.singleton(new BytesRef(b1k))); // no exception } } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java index 7237be57807a..ad29e2de8a36 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java @@ -29,7 +29,7 @@ import org.apache.lucene.index.memory.MemoryIndex; import org.apache.lucene.queries.spans.SpanQuery; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; +import org.apache.lucene.util.automaton.StringsToAutomaton; /** * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}. @@ -66,7 +66,7 @@ private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) // to build an automaton on them List filteredTerms = Arrays.stream(components.getTerms()) - .filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH) + .filter(b -> b.length < StringsToAutomaton.MAX_TERM_LENGTH) .toList(); allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms)); } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index 6cc524a9fac4..821b93083357 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -58,7 +58,7 @@ import org.apache.lucene.tests.analysis.MockTokenizer; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder; +import org.apache.lucene.util.automaton.StringsToAutomaton; import org.junit.After; import org.junit.Before; @@ -1671,12 +1671,11 @@ public void testQueryWithLongTerm() throws IOException { Query query = new BooleanQuery.Builder() .add( - new TermQuery( - new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))), + new TermQuery(new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH))), BooleanClause.Occur.SHOULD) .add( new TermQuery( - new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))), + new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH + 1))), BooleanClause.Occur.SHOULD) .add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD) .build();