Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH#12321: Marked DaciukMihovAutomatonBuilder as deprecated #12332

Merged
merged 1 commit into from
May 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ API Changes
RuntimePermission "accessDeclaredMembers" is needed in applications using
SecurityManager. (Patrick Zhai, Ben Trent, Uwe Schindler)

* GITHUB#xx: DaciukMihovAutomatonBuilder has been marked deprecated in preparation of reducing its visibility in
a future release. (Greg Miller)

New Features
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;

Expand Down Expand Up @@ -780,7 +780,7 @@ public void testPathsNotLost() throws IOException {
acceptStrings.sort(Comparator.naturalOrder());

acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList());
Automaton nonFlattenedAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton nonFlattenedAutomaton = Automata.makeStringUnion(acceptStrings);

TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton);
TokenStream flattenedTokenStream = new FlattenGraphFilter(ts);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@
* @lucene.experimental
*/
public final class Automata {
/**
* {@link #makeStringUnion(Collection)} limits terms of this max length to ensure the stack
* doesn't overflow while building, since our algorithm currently relies on recursion.
*/
public static final int MAX_STRING_UNION_TERM_LENGTH = 1000;

private Automata() {}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,19 @@
*
* @see #build(Collection)
* @see Automata#makeStringUnion(Collection)
* @deprecated Visibility of this class will be reduced in a future release. Users can access this
* functionality directly through {@link Automata#makeStringUnion(Collection)}
*/
@Deprecated
public final class DaciukMihovAutomatonBuilder {

/**
* This builder rejects terms that are more than 1k chars long since it then uses recursion based
* on the length of the string, which might cause stack overflows.
*
* @deprecated See {@link Automata#MAX_STRING_UNION_TERM_LENGTH}
*/
public static final int MAX_TERM_LENGTH = 1_000;
@Deprecated public static final int MAX_TERM_LENGTH = 1_000;

/** The default constructor is private. Use static methods directly. */
private DaciukMihovAutomatonBuilder() {
Expand Down Expand Up @@ -193,7 +198,7 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) {
* or equal compared to any previous sequences added to this automaton (the input must be sorted).
*/
private void add(CharsRef current) {
if (current.length > MAX_TERM_LENGTH) {
if (current.length > Automata.MAX_STRING_UNION_TERM_LENGTH) {
throw new IllegalArgumentException(
"This builder doesn't allow terms that are larger than 1,000 characters, got " + current);
}
Expand Down Expand Up @@ -266,7 +271,10 @@ private static int convert(
/**
* Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
* strings in UTF-8. These strings must be binary-sorted.
*
* @deprecated Please see {@link Automata#makeStringUnion(Collection)} instead
*/
@Deprecated
public static Automaton build(Collection<BytesRef> input) {
final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@
import java.util.List;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;

public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {

public void testSinglePath() throws IOException {
List<BytesRef> acceptStrings = new ArrayList<>();
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand All @@ -48,7 +48,7 @@ public void testParallelPaths() throws IOException {
acceptStrings.add(new BytesRef("123"));
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand All @@ -65,7 +65,7 @@ public void testForkedPath() throws IOException {
acceptStrings.add(new BytesRef("ab3"));
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ private CompiledAutomaton build(int determinizeWorkLimit, String... strings) {
terms.add(new BytesRef(s));
}
Collections.sort(terms);
final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
final Automaton a = Automata.makeStringUnion(terms);
return new CompiledAutomaton(a, true, false, determinizeWorkLimit, false);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.Automata;

/**
* Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
Expand Down Expand Up @@ -67,7 +67,7 @@ private static CharArrayMatcher buildCombinedAutomaton(UHComponents components)
// to build an automaton on them
List<BytesRef> filteredTerms =
Arrays.stream(components.getTerms())
.filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
.filter(b -> b.length < Automata.MAX_STRING_UNION_TERM_LENGTH)
.collect(Collectors.toList());
allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.Automata;
import org.junit.After;
import org.junit.Before;

Expand Down Expand Up @@ -1671,12 +1671,11 @@ public void testQueryWithLongTerm() throws IOException {
Query query =
new BooleanQuery.Builder()
.add(
new TermQuery(
new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))),
new TermQuery(new Term("title", "a".repeat(Automata.MAX_STRING_UNION_TERM_LENGTH))),
BooleanClause.Occur.SHOULD)
.add(
new TermQuery(
new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))),
new Term("title", "a".repeat(Automata.MAX_STRING_UNION_TERM_LENGTH + 1))),
BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
.build();
Expand Down