Skip to content

Commit

Permalink
apache#12276: rename DaciukMihovAutomatonBuilder to StringsToAutomaton
Browse files Browse the repository at this point in the history
  • Loading branch information
mikemccand committed May 18, 2023
1 parent 0c6e8ae commit ad3f89a
Show file tree
Hide file tree
Showing 10 changed files with 30 additions and 23 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ API Changes
They have already been runtime-checked to only be implemented by the specific classes
so this is effectively a non-breaking change.

* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton (Mike McCandless)

New Features
---------------------

Expand Down
4 changes: 4 additions & 0 deletions lucene/MIGRATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ for the currently-positioned document (doing so will result in undefined behavio
`IOContext.READONCE` for opening internally, as that's the only valid usage pattern for checksum input.
Callers should remove the parameter when calling this method.


### DaciukMihovAutomatonBuilder is renamed to StringsToAutomaton

The `org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder` class has been renamed
to `StringsToAutomaton`. Update references to use the new class name; the static
`build(Collection)` method and the `MAX_TERM_LENGTH` constant are otherwise unchanged.

## Migration from Lucene 9.0 to Lucene 9.1

### Test framework package migration and module (LUCENE-10301)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.StringsToAutomaton;
import org.apache.lucene.util.automaton.Transition;

public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
Expand Down Expand Up @@ -780,7 +780,7 @@ public void testPathsNotLost() throws IOException {
acceptStrings.sort(Comparator.naturalOrder());

acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList());
Automaton nonFlattenedAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton nonFlattenedAutomaton = StringsToAutomaton.build(acceptStrings);

TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton);
TokenStream flattenedTokenStream = new FlattenGraphFilter(ts);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
return makeEmpty();
} else {
return DaciukMihovAutomatonBuilder.build(utf8Strings);
return StringsToAutomaton.build(utf8Strings);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@
import org.apache.lucene.util.UnicodeUtil;

/**
* Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm
* requires sorted input data, but is very fast (nearly linear with the input size).
* Builds a minimal, deterministic {@link Automaton} that accepts a set of strings using the
* algorithm described in <a href="https://aclanthology.org/J00-1002.pdf">Incremental Construction
* of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson</a>. This requires
* sorted input data, but is very fast (nearly linear with the input size).
*
* @see #build(Collection)
* @see Automata#makeStringUnion(Collection)
*/
public final class DaciukMihovAutomatonBuilder {
public final class StringsToAutomaton {

/**
* This builder rejects terms that are more than 1k chars long since it then uses recursion based
Expand All @@ -42,7 +44,7 @@ public final class DaciukMihovAutomatonBuilder {
public static final int MAX_TERM_LENGTH = 1_000;

/** The default constructor is private. Use static methods directly. */
private DaciukMihovAutomatonBuilder() {
private StringsToAutomaton() {
super();
}

Expand Down Expand Up @@ -246,7 +248,7 @@ private static int convert(
visited.put(s, converted);
int i = 0;
int[] labels = s.labels;
for (DaciukMihovAutomatonBuilder.State target : s.states) {
for (StringsToAutomaton.State target : s.states) {
a.addTransition(converted, convert(a, target, visited), labels[i++]);
}

Expand All @@ -258,7 +260,7 @@ private static int convert(
* strings in UTF-8. These strings must be binary-sorted.
*/
public static Automaton build(Collection<BytesRef> input) {
final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
final StringsToAutomaton builder = new StringsToAutomaton();

char[] chars = new char[0];
CharsRef ref = new CharsRef();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.StringsToAutomaton;

public class TestAutomatonToTokenStream extends BaseTokenStreamTestCase {

public void testSinglePath() throws IOException {
List<BytesRef> acceptStrings = new ArrayList<>();
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand All @@ -48,7 +48,7 @@ public void testParallelPaths() throws IOException {
acceptStrings.add(new BytesRef("123"));
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand All @@ -65,7 +65,7 @@ public void testForkedPath() throws IOException {
acceptStrings.add(new BytesRef("ab3"));
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);
Automaton flatPathAutomaton = StringsToAutomaton.build(acceptStrings);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ private CompiledAutomaton build(int determinizeWorkLimit, String... strings) {
terms.add(new BytesRef(s));
}
Collections.sort(terms);
final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
final Automaton a = StringsToAutomaton.build(terms);
return new CompiledAutomaton(a, true, false, false);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,20 @@
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;

public class TestDaciukMihovAutomatonBuilder extends LuceneTestCase {
public class TestStringsToAutomaton extends LuceneTestCase {

public void testLargeTerms() {
byte[] b10k = new byte[10_000];
Arrays.fill(b10k, (byte) 'a');
IllegalArgumentException e =
expectThrows(
IllegalArgumentException.class,
() -> DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b10k))));
() -> StringsToAutomaton.build(Collections.singleton(new BytesRef(b10k))));
assertTrue(
e.getMessage()
.startsWith("This builder doesn't allow terms that are larger than 1,000 characters"));

byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);
DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b1k))); // no exception
StringsToAutomaton.build(Collections.singleton(new BytesRef(b1k))); // no exception
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.StringsToAutomaton;

/**
* Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
Expand Down Expand Up @@ -66,7 +66,7 @@ private static CharArrayMatcher buildCombinedAutomaton(UHComponents components)
// to build an automaton on them
List<BytesRef> filteredTerms =
Arrays.stream(components.getTerms())
.filter(b -> b.length < DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH)
.filter(b -> b.length < StringsToAutomaton.MAX_TERM_LENGTH)
.toList();
allAutomata.add(CharArrayMatcher.fromTerms(filteredTerms));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.StringsToAutomaton;
import org.junit.After;
import org.junit.Before;

Expand Down Expand Up @@ -1671,12 +1671,11 @@ public void testQueryWithLongTerm() throws IOException {
Query query =
new BooleanQuery.Builder()
.add(
new TermQuery(
new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH))),
new TermQuery(new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH))),
BooleanClause.Occur.SHOULD)
.add(
new TermQuery(
new Term("title", "a".repeat(DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1))),
new Term("title", "a".repeat(StringsToAutomaton.MAX_TERM_LENGTH + 1))),
BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("title", "title")), BooleanClause.Occur.SHOULD)
.build();
Expand Down

0 comments on commit ad3f89a

Please sign in to comment.