diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7fe8d40114bb..8d12ffca5f18 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -53,6 +53,8 @@ API Changes * GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton +* GITHUB#12321: Reduced visibility of StringsToAutomaton. Please use Automata#makeStringUnion instead. (Greg Miller) + New Features --------------------- diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 2bb6c27f563a..15f813fde8b7 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -90,7 +90,10 @@ for the currently-positioned document (doing so will result in undefined behavio Callers should remove the parameter when calling this method. -### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton +### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton and made package-private + +The former `DaciukMihovAutomatonBuilder#build` functionality is exposed through `Automata#makeStringUnion`. +Users should be able to directly migrate to the `Automata` static method as a 1:1 replacement. 
## Migration from Lucene 9.0 to Lucene 9.1 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java index 13d3dd378e79..7fa901ac5bf4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java @@ -40,9 +40,9 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Operations; -import org.apache.lucene.util.automaton.StringsToAutomaton; import org.apache.lucene.util.automaton.Transition; public class TestFlattenGraphFilter extends BaseTokenStreamTestCase { @@ -780,7 +780,7 @@ public void testPathsNotLost() throws IOException { acceptStrings.sort(Comparator.naturalOrder()); acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList()); - Automaton nonFlattenedAutomaton = StringsToAutomaton.build(acceptStrings); + Automaton nonFlattenedAutomaton = Automata.makeStringUnion(acceptStrings); TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton); TokenStream flattenedTokenStream = new FlattenGraphFilter(ts); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java index 26afdc455649..20233374ce93 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java @@ -40,6 +40,11 @@ * @lucene.experimental */ public final class Automata { + /** + * {@link #makeStringUnion(Collection)} limits terms to this max length to ensure the stack + * doesn't overflow while 
building, since our algorithm currently relies on recursion. + */ + public static final int MAX_STRING_UNION_TERM_LENGTH = 1000; private Automata() {} diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index c85e26487039..08826dcbbe3c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -32,16 +32,9 @@ * of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson. This requires * sorted input data, but is very fast (nearly linear with the input size). * - * @see #build(Collection) * @see Automata#makeStringUnion(Collection) */ -public final class StringsToAutomaton { - - /** - * This builder rejects terms that are more than 1k chars long since it then uses recursion based - * on the length of the string, which might cause stack overflows. - */ - public static final int MAX_TERM_LENGTH = 1_000; +final class StringsToAutomaton { /** The default constructor is private. Use static methods directly. */ private StringsToAutomaton() { @@ -195,7 +188,7 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) { * or equal compared to any previous sequences added to this automaton (the input must be sorted). */ private void add(CharsRef current) { - if (current.length > MAX_TERM_LENGTH) { + if (current.length > Automata.MAX_STRING_UNION_TERM_LENGTH) { throw new IllegalArgumentException( "This builder doesn't allow terms that are larger than 1,000 characters, got " + current); } @@ -269,7 +262,7 @@ private static int convert( * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing * strings in UTF-8. These strings must be binary-sorted. 
*/ - public static Automaton build(Collection input) { + static Automaton build(Collection input) { final StringsToAutomaton builder = new StringsToAutomaton(); CharsRefBuilder current = new CharsRefBuilder(); diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java index 17157c48bdc3..d558ad6d3697 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestAutomatonToTokenStream.java @@ -22,8 +22,8 @@ import java.util.List; import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.StringsToAutomaton; public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase { @@ -31,7 +31,7 @@ public void testSinglePath() throws IOException { List acceptStrings = new ArrayList<>(); acceptStrings.add(new BytesRef("abc")); - Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings); + Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings); TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); assertTokenStreamContents( ts, @@ -48,7 +48,7 @@ public void testParallelPaths() throws IOException { acceptStrings.add(new BytesRef("123")); acceptStrings.add(new BytesRef("abc")); - Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings); + Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings); TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); assertTokenStreamContents( ts, @@ -65,7 +65,7 @@ public void testForkedPath() throws IOException { acceptStrings.add(new BytesRef("ab3")); acceptStrings.add(new BytesRef("abc")); - Automaton flatPathAutomaton = 
StringsToAutomaton.build(acceptStrings); + Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings); TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton); assertTokenStreamContents( ts, diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java index cb69e744b54a..b4e23f6a2b97 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java @@ -35,7 +35,7 @@ private CompiledAutomaton build(int determinizeWorkLimit, String... strings) { terms.add(new BytesRef(s)); } Collections.sort(terms); - final Automaton a = StringsToAutomaton.build(terms); + final Automaton a = Automata.makeStringUnion(terms); return new CompiledAutomaton(a, true, false, false); } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java index ad29e2de8a36..a4b090902ed4 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java @@ -29,7 +29,7 @@ import org.apache.lucene.index.memory.MemoryIndex; import org.apache.lucene.queries.spans.SpanQuery; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.StringsToAutomaton; +import org.apache.lucene.util.automaton.Automata; /** * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}. 
@@ -66,7 +66,7 @@ private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) // to build an automaton on them List filteredTerms = Arrays.stream(components.getTerms()) - .filter(b -> b.length < StringsToAutomaton.MAX_TERM_LENGTH) + .filter(b -> b.length < Automata.MAX_STRING_UNION_TERM_LENGTH) .toList(); allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms)); } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index 821b93083357..54792a569f76 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -58,7 +58,7 @@ import org.apache.lucene.tests.analysis.MockTokenizer; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.util.automaton.StringsToAutomaton; +import org.apache.lucene.util.automaton.Automata; import org.junit.After; import org.junit.Before; @@ -1671,11 +1671,11 @@ public void testQueryWithLongTerm() throws IOException { Query query = new BooleanQuery.Builder() .add( - new TermQuery(new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH))), + new TermQuery(new Term("title", "a".repeat(Automata.MAX_STRING_UNION_TERM_LENGTH))), BooleanClause.Occur.SHOULD) .add( new TermQuery( - new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH + 1))), + new Term("title", "a".repeat(Automata.MAX_STRING_UNION_TERM_LENGTH + 1))), BooleanClause.Occur.SHOULD) .add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD) .build();