Skip to content

Commit

Permalink
StringsToAutomaton#build to take List as parameter instead of Collection
Browse files (browse the repository at this point in the history)
  • Loading branch information
shubhamvishu committed Jul 9, 2023
1 parent d03c8f1 commit d820f8c
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 21 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -32,6 +32,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.StringHelper;
Expand All @@ -43,8 +44,8 @@
*/
public final class Automata {
/**
* {@link #makeStringUnion(Collection)} limits terms of this max length to ensure the stack
* doesn't overflow while building, since our algorithm currently relies on recursion.
* {@link #makeStringUnion(List)} limits terms of this max length to ensure the stack doesn't
* overflow while building, since our algorithm currently relies on recursion.
*/
public static final int MAX_STRING_UNION_TERM_LENGTH = 1000;

Expand Down Expand Up @@ -576,7 +577,7 @@ public static Automaton makeString(int[] word, int offset, int length) {
* @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint
* based (full unicode codepoints on transitions).
*/
public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
public static Automaton makeStringUnion(List<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
return makeEmpty();
} else {
Expand All @@ -593,7 +594,7 @@ public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
* @return An {@link Automaton} accepting all input strings. The resulting automaton is binary
* based (UTF-8 encoded byte transition labels).
*/
public static Automaton makeBinaryStringUnion(Collection<BytesRef> utf8Strings) {
public static Automaton makeBinaryStringUnion(List<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
return makeEmpty();
} else {
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,9 +18,9 @@

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.List;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
Expand All @@ -35,8 +35,8 @@
* to directly build a binary {@link Automaton} representation. Users should access this
* functionality through {@link Automata} static methods.
*
* @see Automata#makeStringUnion(Collection)
* @see Automata#makeBinaryStringUnion(Collection)
* @see Automata#makeStringUnion(List)
* @see Automata#makeBinaryStringUnion(List)
* @see Automata#makeStringUnion(BytesRefIterator)
* @see Automata#makeBinaryStringUnion(BytesRefIterator)
*/
Expand Down Expand Up @@ -238,7 +238,7 @@ private Automaton completeAndConvert() {
* UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
* asBinary}.
*/
static Automaton build(Collection<BytesRef> input, boolean asBinary) {
static Automaton build(List<BytesRef> input, boolean asBinary) {
final StringsToAutomaton builder = new StringsToAutomaton();

for (BytesRef b : input) {
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -280,7 +280,7 @@ public void testIntersectRandom() throws IOException {
acceptTerms.add(s2);
sortedAcceptTerms.add(newBytesRef(s2));
}
a = Automata.makeStringUnion(sortedAcceptTerms);
a = Automata.makeStringUnion(new ArrayList<>(sortedAcceptTerms));
}

final CompiledAutomaton c = new CompiledAutomaton(a, true, false, false);
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -76,7 +76,7 @@ public void setUp() throws Exception {
writer.addDocument(doc);
}

termsAutomaton = Automata.makeStringUnion(terms);
termsAutomaton = Automata.makeStringUnion(new ArrayList<>(terms));

reader = writer.getReader();
searcher = newSearcher(reader);
Expand Down Expand Up @@ -182,7 +182,8 @@ public void testIntersect() throws Exception {
}

Automaton actual =
Operations.determinize(Automata.makeStringUnion(found), DEFAULT_DETERMINIZE_WORK_LIMIT);
Operations.determinize(
Automata.makeStringUnion(new ArrayList<>(found)), DEFAULT_DETERMINIZE_WORK_LIMIT);
assertTrue(Operations.sameLanguage(expected, actual));
}
}
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -39,18 +39,16 @@ public class TestStringsToAutomaton extends LuceneTestCase {

public void testBasic() throws Exception {
List<BytesRef> terms = basicTerms();
Collections.sort(terms);

Automaton a = build(terms, false);
Automaton a = build(terms, false, false);
checkAutomaton(terms, a, false);
checkMinimized(a);
}

public void testBasicBinary() throws Exception {
List<BytesRef> terms = basicTerms();
Collections.sort(terms);

Automaton a = build(terms, true);
Automaton a = build(terms, true, false);
checkAutomaton(terms, a, true);
checkMinimized(a);
}
Expand Down Expand Up @@ -78,7 +76,7 @@ public void testRandomMinimized() throws Exception {
Automaton expected =
MinimizationOperations.minimize(
Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
Automaton actual = build(sortedTerms, buildBinary);
Automaton actual = build(sortedTerms, buildBinary, true);
assertSameAutomaton(expected, actual);
}
}
Expand All @@ -96,8 +94,7 @@ public void testLargeTerms() throws Exception {
Arrays.fill(b10k, (byte) 'a');
IllegalArgumentException e =
expectThrows(
IllegalArgumentException.class,
() -> build(Collections.singleton(new BytesRef(b10k)), false));
IllegalArgumentException.class, () -> build(List.of(new BytesRef(b10k)), false, true));
assertTrue(
e.getMessage()
.startsWith(
Expand All @@ -106,7 +103,7 @@ public void testLargeTerms() throws Exception {
+ " characters"));

byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);
build(Collections.singleton(new BytesRef(b1k)), false); // no exception
build(List.of(new BytesRef(b1k)), false, true); // no exception
}

private void testRandom(boolean allowBinary) throws Exception {
Expand All @@ -124,7 +121,7 @@ private void testRandom(boolean allowBinary) throws Exception {
}

List<BytesRef> sorted = terms.stream().sorted().toList();
Automaton a = build(sorted, allowBinary);
Automaton a = build(sorted, allowBinary, true);
checkAutomaton(sorted, a, allowBinary);
}
}
Expand Down Expand Up @@ -171,7 +168,11 @@ private List<BytesRef> basicTerms() {
return terms;
}

private Automaton build(Collection<BytesRef> terms, boolean asBinary) throws IOException {
private Automaton build(List<BytesRef> terms, boolean asBinary, boolean isTermsSorted)
throws IOException {
if (isTermsSorted == false) {
Collections.sort(terms);
}
if (random().nextBoolean()) {
return StringsToAutomaton.build(terms, asBinary);
} else {
Expand Down

0 comments on commit d820f8c

Please sign in to comment.