GH#12321: Reduce visibility of StringsToAutomaton #12331

Merged 2 commits on May 26, 2023
Changes from all commits
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
@@ -53,6 +53,8 @@ API Changes

* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton

* GITHUB#12321: Reduced visibility of StringsToAutomaton. Please use Automata#makeStringUnion instead. (Greg Miller)

New Features
---------------------

5 changes: 4 additions & 1 deletion lucene/MIGRATE.md
@@ -90,7 +90,10 @@ for the currently-positioned document (doing so will result in undefined behavior)
Callers should remove the parameter when calling this method.


### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton
### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton and made package-private

The former `DaciukMihovAutomatonBuilder#build` functionality is exposed through `Automata#makeStringUnion`.
Users should be able to directly migrate to the `Automata` static method as a 1:1 replacement.
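
A minimal before/after sketch of the migration, assuming a binary-sorted `List<BytesRef>` (the wrapper class and method names below are illustrative only, not part of Lucene):

```java
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;

class StringUnionMigrationExample {
  // Previously: StringsToAutomaton.build(sortedTerms) -- no longer accessible.
  // The input contract is unchanged: the terms must be binary-sorted UTF-8.
  static Automaton union(List<BytesRef> sortedTerms) {
    return Automata.makeStringUnion(sortedTerms);
  }
}
```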


## Migration from Lucene 9.0 to Lucene 9.1
@@ -40,9 +40,9 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.StringsToAutomaton;
import org.apache.lucene.util.automaton.Transition;

public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
@@ -780,7 +780,7 @@ public void testPathsNotLost() throws IOException {
acceptStrings.sort(Comparator.naturalOrder());

acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList());
Automaton nonFlattenedAutomaton = StringsToAutomaton.build(acceptStrings);
Automaton nonFlattenedAutomaton = Automata.makeStringUnion(acceptStrings);

TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton);
TokenStream flattenedTokenStream = new FlattenGraphFilter(ts);
@@ -40,6 +40,11 @@
* @lucene.experimental
*/
public final class Automata {
/**
* {@link #makeStringUnion(Collection)} limits terms of this max length to ensure the stack
* doesn't overflow while building, since our algorithm currently relies on recursion.
*/
public static final int MAX_STRING_UNION_TERM_LENGTH = 1000;

private Automata() {}

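A sketch of how callers outside the package might use the new constant to skip over-long terms before building a union, mirroring the unified highlighter change further down (the class and method names here are illustrative, not Lucene API):

```java
import java.util.List;
import java.util.stream.Collectors;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;

class TermLengthGuardExample {
  // makeStringUnion rejects terms longer than MAX_STRING_UNION_TERM_LENGTH chars;
  // filtering on UTF-8 byte length is a conservative upper bound for that check.
  static Automaton safeUnion(List<BytesRef> sortedTerms) {
    List<BytesRef> filtered =
        sortedTerms.stream()
            .filter(b -> b.length <= Automata.MAX_STRING_UNION_TERM_LENGTH)
            .collect(Collectors.toList());
    return Automata.makeStringUnion(filtered);
  }
}
```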
@@ -32,16 +32,9 @@
* of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson</a>. This requires
* sorted input data, but is very fast (nearly linear with the input size).
*
* @see #build(Collection)
* @see Automata#makeStringUnion(Collection)
*/
public final class StringsToAutomaton {

/**
* This builder rejects terms that are more than 1k chars long since it then uses recursion based
* on the length of the string, which might cause stack overflows.
*/
public static final int MAX_TERM_LENGTH = 1_000;
final class StringsToAutomaton {

/** The default constructor is private. Use static methods directly. */
private StringsToAutomaton() {
@@ -195,7 +188,7 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) {
* or equal compared to any previous sequences added to this automaton (the input must be sorted).
*/
private void add(CharsRef current) {
if (current.length > MAX_TERM_LENGTH) {
if (current.length > Automata.MAX_STRING_UNION_TERM_LENGTH) {
throw new IllegalArgumentException(
"This builder doesn't allow terms that are larger than 1,000 characters, got " + current);
}
@@ -269,7 +262,7 @@ private static int convert(
* Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
* strings in UTF-8. These strings must be binary-sorted.
*/
public static Automaton build(Collection<BytesRef> input) {
static Automaton build(Collection<BytesRef> input) {
final StringsToAutomaton builder = new StringsToAutomaton();

CharsRefBuilder current = new CharsRefBuilder();
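The sorted-input requirement carries over unchanged to `Automata#makeStringUnion`. A small sketch of preparing unsorted strings, along the lines of the test code below (the class name is hypothetical):

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;

class SortedUnionExample {
  static Automaton buildUnion(List<String> strings) {
    List<BytesRef> terms = new ArrayList<>();
    for (String s : strings) {
      terms.add(new BytesRef(s));
    }
    // BytesRef's natural order is unsigned-byte order, i.e. the binary (UTF-8)
    // order that makeStringUnion expects.
    Collections.sort(terms);
    return Automata.makeStringUnion(terms);
  }
}
```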
@@ -22,16 +22,16 @@
import java.util.List;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.StringsToAutomaton;

public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {

public void testSinglePath() throws IOException {
List<BytesRef> acceptStrings = new ArrayList<>();
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
@@ -48,7 +48,7 @@ public void testParallelPaths() throws IOException {
acceptStrings.add(new BytesRef("123"));
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
@@ -65,7 +65,7 @@ public void testForkedPath() throws IOException {
acceptStrings.add(new BytesRef("ab3"));
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
@@ -35,7 +35,7 @@ private CompiledAutomaton build(int determinizeWorkLimit, String... strings) {
terms.add(new BytesRef(s));
}
Collections.sort(terms);
final Automaton a = StringsToAutomaton.build(terms);
final Automaton a = Automata.makeStringUnion(terms);
return new CompiledAutomaton(a, true, false, false);
}

@@ -29,7 +29,7 @@
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.StringsToAutomaton;
import org.apache.lucene.util.automaton.Automata;

/**
* Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
@@ -66,7 +66,7 @@ private static CharArrayMatcher buildCombinedAutomaton(UHComponents components)
// to build an automaton on them
List<BytesRef> filteredTerms =
Arrays.stream(components.getTerms())
.filter(b -> b.length < StringsToAutomaton.MAX_TERM_LENGTH)
.filter(b -> b.length < Automata.MAX_STRING_UNION_TERM_LENGTH)
.toList();
allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
}
@@ -58,7 +58,7 @@
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.automaton.StringsToAutomaton;
import org.apache.lucene.util.automaton.Automata;
import org.junit.After;
import org.junit.Before;

@@ -1671,11 +1671,11 @@ public void testQueryWithLongTerm() throws IOException {
Query query =
new BooleanQuery.Builder()
.add(
new TermQuery(new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH))),
new TermQuery(new Term("title", "a".repeat(Automata.MAX_STRING_UNION_TERM_LENGTH))),
BooleanClause.Occur.SHOULD)
.add(
new TermQuery(
new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH + 1))),
new Term("title", "a".repeat(Automata.MAX_STRING_UNION_TERM_LENGTH + 1))),
BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
.build();