Skip to content

Commit

Permalink
Replace assert with IAE in StringsToAutomaton#build if data is not sorted (#12427)
Browse files Browse the repository at this point in the history

Also update the API to accept the more general Iterable<BytesRef> instead of Collection<BytesRef>.
  • Loading branch information
shubhamvishu authored and gsmiller committed Oct 30, 2023
1 parent 2388d89 commit 5b26498
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 17 deletions.
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ API Changes

* GITHUB#12718: Make IndexSearcher#getSlices final as it is not expected to be overridden (Luca Cavanna)

* GITHUB#12427: Automata#makeStringUnion and Automata#makeBinaryStringUnion now accept Iterable<BytesRef> instead of
Collection<BytesRef>. They also now explicitly throw IllegalArgumentException if the input data is not properly sorted,
instead of relying on an assert. (Shubham Chaudhary)

New Features
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@
*/
public final class Automata {
/**
* {@link #makeStringUnion(Collection)} limits terms of this max length to ensure the stack
* doesn't overflow while building, since our algorithm currently relies on recursion.
* {@link #makeStringUnion(Iterable)} limits terms of this max length to ensure the stack doesn't
* overflow while building, since our algorithm currently relies on recursion.
*/
public static final int MAX_STRING_UNION_TERM_LENGTH = 1000;

Expand Down Expand Up @@ -576,8 +576,8 @@ public static Automaton makeString(int[] word, int offset, int length) {
* @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint
* based (full unicode codepoints on transitions).
*/
public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
public static Automaton makeStringUnion(Iterable<BytesRef> utf8Strings) {
if (utf8Strings.iterator().hasNext() == false) {
return makeEmpty();
} else {
return DaciukMihovAutomatonBuilder.build(utf8Strings, false);
Expand All @@ -593,8 +593,8 @@ public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
* @return An {@link Automaton} accepting all input strings. The resulting automaton is binary
* based (UTF-8 encoded byte transition labels).
*/
public static Automaton makeBinaryStringUnion(Collection<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
public static Automaton makeBinaryStringUnion(Iterable<BytesRef> utf8Strings) {
if (utf8Strings.iterator().hasNext() == false) {
return makeEmpty();
} else {
return DaciukMihovAutomatonBuilder.build(utf8Strings, true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.IdentityHashMap;
import org.apache.lucene.util.ArrayUtil;
Expand All @@ -31,13 +30,12 @@
* Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm
* requires sorted input data, but is very fast (nearly linear with the input size).
*
* @see #build(Collection)
* @see Automata#makeStringUnion(Collection)
* @see Automata#makeBinaryStringUnion(Collection)
* @see Automata#makeStringUnion(Iterable)
* @see Automata#makeBinaryStringUnion(Iterable)
* @see Automata#makeStringUnion(BytesRefIterator)
* @see Automata#makeBinaryStringUnion(BytesRefIterator)
* @deprecated Visibility of this class will be reduced in a future release. Users can access this
* functionality directly through {@link Automata#makeStringUnion(Collection)}
* functionality directly through {@link Automata#makeStringUnion(Iterable)}
*/
@Deprecated
public final class DaciukMihovAutomatonBuilder {
Expand Down Expand Up @@ -244,18 +242,18 @@ private Automaton completeAndConvert() {
* Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
* strings in UTF-8. These strings must be binary-sorted.
*
* @deprecated Please see {@link Automata#makeStringUnion(Collection)} instead
* @deprecated Please see {@link Automata#makeStringUnion(Iterable)} instead
*/
@Deprecated
public static Automaton build(Collection<BytesRef> input) {
public static Automaton build(Iterable<BytesRef> input) {
return build(input, false);
}

/**
* Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
* strings in UTF-8. These strings must be binary-sorted.
*/
static Automaton build(Collection<BytesRef> input, boolean asBinary) {
static Automaton build(Iterable<BytesRef> input, boolean asBinary) {
final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

for (BytesRef b : input) {
Expand Down Expand Up @@ -290,9 +288,11 @@ private void add(BytesRef current, boolean asBinary) {
+ current);
}
assert stateRegistry != null : "Automaton already built.";
assert previous == null || previous.get().compareTo(current) <= 0
: "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current;
assert setPrevious(current);
if (previous != null && previous.get().compareTo(current) > 0) {
throw new IllegalArgumentException(
"Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current);
}
setPrevious(current);

// Reusable codepoint information if we're building a non-binary based automaton
UnicodeUtil.UTF8CodePoint codePoint = null;
Expand Down

0 comments on commit 5b26498

Please sign in to comment.