From 5b26498ec72220f6d88d5759dbfb070709bf4bcd Mon Sep 17 00:00:00 2001 From: Shubham Chaudhary <36742242+shubhamvishu@users.noreply.github.com> Date: Tue, 31 Oct 2023 01:17:56 +0530 Subject: [PATCH] Replace assert with IAE in StringsToAutomaton#build if data is not sorted (#12427) Also update the API to accept more general Iterable instead of Collection --- lucene/CHANGES.txt | 4 ++++ .../lucene/util/automaton/Automata.java | 12 +++++----- .../DaciukMihovAutomatonBuilder.java | 22 +++++++++---------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5f5a239be380..60672b6c9684 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -23,6 +23,10 @@ API Changes * GITHUB#12718: Make IndexSearcher#getSlices final as it is not expected to be overridden (Luca Cavanna) +* GITHUB#12427: Automata#makeStringUnion #makeBinaryStringUnion now accept Iterable instead of + Collection. They also now explicitly throw IllegalArgumentException if input data is not properly sorted + instead of relying on assert. (Shubham Chaudhary) + New Features --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java index c829429d3c9b..cf1581fc9127 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java @@ -43,8 +43,8 @@ */ public final class Automata { /** - * {@link #makeStringUnion(Collection)} limits terms of this max length to ensure the stack - * doesn't overflow while building, since our algorithm currently relies on recursion. + * {@link #makeStringUnion(Iterable)} limits terms of this max length to ensure the stack doesn't + * overflow while building, since our algorithm currently relies on recursion. */ public static final int MAX_STRING_UNION_TERM_LENGTH = 1000; @@ -576,8 +576,8 @@ public static Automaton makeString(int[] word, int offset, int length) { * @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint * based (full unicode codepoints on transitions). */ - public static Automaton makeStringUnion(Collection utf8Strings) { - if (utf8Strings.isEmpty()) { + public static Automaton makeStringUnion(Iterable utf8Strings) { + if (utf8Strings.iterator().hasNext() == false) { return makeEmpty(); } else { return DaciukMihovAutomatonBuilder.build(utf8Strings, false); @@ -593,8 +593,8 @@ public static Automaton makeStringUnion(Collection utf8Strings) { * @return An {@link Automaton} accepting all input strings. The resulting automaton is binary * based (UTF-8 encoded byte transition labels). */ - public static Automaton makeBinaryStringUnion(Collection utf8Strings) { - if (utf8Strings.isEmpty()) { + public static Automaton makeBinaryStringUnion(Iterable utf8Strings) { + if (utf8Strings.iterator().hasNext() == false) { return makeEmpty(); } else { return DaciukMihovAutomatonBuilder.build(utf8Strings, true); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java index df12250f4efc..d87e259fa89f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.util.Arrays; -import java.util.Collection; import java.util.HashMap; import java.util.IdentityHashMap; import org.apache.lucene.util.ArrayUtil; @@ -31,13 +30,12 @@ * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm * requires sorted input data, but is very fast (nearly linear with the input size). * - * @see #build(Collection) - * @see Automata#makeStringUnion(Collection) - * @see Automata#makeBinaryStringUnion(Collection) + * @see Automata#makeStringUnion(Iterable) + * @see Automata#makeBinaryStringUnion(Iterable) * @see Automata#makeStringUnion(BytesRefIterator) * @see Automata#makeBinaryStringUnion(BytesRefIterator) * @deprecated Visibility of this class will be reduced in a future release. Users can access this - * functionality directly through {@link Automata#makeStringUnion(Collection)} + * functionality directly through {@link Automata#makeStringUnion(Iterable)} */ @Deprecated public final class DaciukMihovAutomatonBuilder { @@ -244,10 +242,10 @@ private Automaton completeAndConvert() { * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing * strings in UTF-8. These strings must be binary-sorted. * - * @deprecated Please see {@link Automata#makeStringUnion(Collection)} instead + * @deprecated Please see {@link Automata#makeStringUnion(Iterable)} instead */ @Deprecated - public static Automaton build(Collection input) { + public static Automaton build(Iterable input) { return build(input, false); } @@ -255,7 +253,7 @@ public static Automaton build(Collection input) { * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing * strings in UTF-8. These strings must be binary-sorted. */ - static Automaton build(Collection input, boolean asBinary) { + static Automaton build(Iterable input, boolean asBinary) { final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder(); for (BytesRef b : input) { @@ -290,9 +288,11 @@ private void add(BytesRef current, boolean asBinary) { + current); } assert stateRegistry != null : "Automaton already built."; - assert previous == null || previous.get().compareTo(current) <= 0 - : "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current; - assert setPrevious(current); + if (previous != null && previous.get().compareTo(current) > 0) { + throw new IllegalArgumentException( + "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current); + } + setPrevious(current); // Reusable codepoint information if we're building a non-binary based automaton UnicodeUtil.UTF8CodePoint codePoint = null;