Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

StringsToAutomaton#build to take List as parameter instead of Collection #12427

Merged
merged 1 commit into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@
*/
public final class Automata {
/**
* {@link #makeStringUnion(Collection)} limits terms of this max length to ensure the stack
* doesn't overflow while building, since our algorithm currently relies on recursion.
* {@link #makeStringUnion(Iterable)} limits terms to this max length to ensure the stack doesn't
* overflow while building, since our algorithm currently relies on recursion.
*/
public static final int MAX_STRING_UNION_TERM_LENGTH = 1000;

Expand Down Expand Up @@ -576,8 +576,8 @@ public static Automaton makeString(int[] word, int offset, int length) {
* @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint
* based (full unicode codepoints on transitions).
*/
public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
public static Automaton makeStringUnion(Iterable<BytesRef> utf8Strings) {
if (utf8Strings.iterator().hasNext() == false) {
return makeEmpty();
} else {
return StringsToAutomaton.build(utf8Strings, false);
Expand All @@ -593,8 +593,8 @@ public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
* @return An {@link Automaton} accepting all input strings. The resulting automaton is binary
* based (UTF-8 encoded byte transition labels).
*/
public static Automaton makeBinaryStringUnion(Collection<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
public static Automaton makeBinaryStringUnion(Iterable<BytesRef> utf8Strings) {
if (utf8Strings.iterator().hasNext() == false) {
return makeEmpty();
} else {
return StringsToAutomaton.build(utf8Strings, true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.IdentityHashMap;
import org.apache.lucene.util.ArrayUtil;
Expand All @@ -35,8 +34,8 @@
* to directly build a binary {@link Automaton} representation. Users should access this
* functionality through {@link Automata} static methods.
*
* @see Automata#makeStringUnion(Collection)
* @see Automata#makeBinaryStringUnion(Collection)
* @see Automata#makeStringUnion(Iterable)
* @see Automata#makeBinaryStringUnion(Iterable)
* @see Automata#makeStringUnion(BytesRefIterator)
* @see Automata#makeBinaryStringUnion(BytesRefIterator)
*/
Expand Down Expand Up @@ -238,7 +237,7 @@ private Automaton completeAndConvert() {
* UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
* asBinary}.
*/
static Automaton build(Collection<BytesRef> input, boolean asBinary) {
static Automaton build(Iterable<BytesRef> input, boolean asBinary) {
final StringsToAutomaton builder = new StringsToAutomaton();

for (BytesRef b : input) {
Expand Down Expand Up @@ -273,9 +272,11 @@ private void add(BytesRef current, boolean asBinary) {
+ current);
}
assert stateRegistry != null : "Automaton already built.";
assert previous == null || previous.get().compareTo(current) <= 0
: "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current;
assert setPrevious(current);
if (previous != null && previous.get().compareTo(current) > 0) {
throw new IllegalArgumentException(
"Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current);
}
setPrevious(current);

// Reusable codepoint information if we're building a non-binary based automaton
UnicodeUtil.UTF8CodePoint codePoint = null;
Expand Down