Skip to content

Commit

Permalink
Check and enforce the input Collection is sorted for creating Strings…
Browse files Browse the repository at this point in the history
…ToAutomaton
  • Loading branch information
shubhamvishu committed Jul 11, 2023
1 parent b4619d8 commit 27b08c4
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ public void testPathsNotLost() throws IOException {
acceptStrings.sort(Comparator.naturalOrder());

acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList());
Automaton nonFlattenedAutomaton = Automata.makeStringUnion(acceptStrings);
Automaton nonFlattenedAutomaton = Automata.makeStringUnion(acceptStrings, true);

TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton);
TokenStream flattenedTokenStream = new FlattenGraphFilter(ts);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.StringHelper;
Expand Down Expand Up @@ -568,6 +573,32 @@ public static Automaton makeString(int[] word, int offset, int length) {
return a;
}

/**
 * Produces a sorted snapshot of the given strings without touching the caller's collection.
 *
 * @param utf8Strings the strings to sort; only read, never reordered
 * @return a fixed-size list view over a freshly allocated array holding the same elements,
 *     ordered by {@link ArrayUtil#timSort}
 */
private static List<BytesRef> makeStringsSorted(Collection<BytesRef> utf8Strings) {
  // Work on a private array copy so the input collection keeps its original order.
  final BytesRef[] sortedCopy = utf8Strings.toArray(new BytesRef[0]);
  ArrayUtil.timSort(sortedCopy);
  // Arrays.asList wraps the array rather than copying it again.
  return Arrays.asList(sortedCopy);
}

/**
 * Returns a new (deterministic and minimal) automaton that accepts the union of the given
 * collection of {@link BytesRef}s representing UTF-8 encoded strings.
 *
 * @param utf8Strings The input strings, UTF-8 encoded. The collection must already be in sorted
 *     order when {@code sorted} is {@code true}; when {@code sorted} is {@code false} a sorted
 *     copy is built internally and the collection itself is left as-is.
 * @param sorted {@code true} if the caller guarantees {@code utf8Strings} is already sorted,
 *     {@code false} to have this method sort a copy of the input before building.
 * @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint
 *     based (full unicode codepoints on transitions).
 */
public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings, boolean sorted) {
  if (utf8Strings.isEmpty()) {
    return makeEmpty();
  }
  // Only pay for the copy-and-sort when the caller has not vouched for the ordering.
  final Collection<BytesRef> orderedStrings =
      sorted ? utf8Strings : makeStringsSorted(utf8Strings);
  return StringsToAutomaton.build(orderedStrings, false);
}

/**
* Returns a new (deterministic and minimal) automaton that accepts the union of the given
* collection of {@link BytesRef}s representing UTF-8 encoded strings.
Expand All @@ -580,10 +611,40 @@ public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
return makeEmpty();
} else {
boolean sorted =
utf8Strings.size() == 1
|| ((utf8Strings instanceof SortedSet
&& ((SortedSet<BytesRef>) utf8Strings).comparator() == null)
|| (utf8Strings instanceof TreeSet
&& ((TreeSet<BytesRef>) utf8Strings).comparator() == null));
if (sorted == false) {
return StringsToAutomaton.build(makeStringsSorted(utf8Strings), false);
}
return StringsToAutomaton.build(utf8Strings, false);
}
}

/**
 * Returns a new (deterministic and minimal) automaton that accepts the union of the given
 * collection of {@link BytesRef}s representing UTF-8 encoded strings. The resulting automaton
 * will be built in a binary representation.
 *
 * @param utf8Strings The input strings, UTF-8 encoded. The collection must already be in sorted
 *     order when {@code sorted} is {@code true}; when {@code sorted} is {@code false} a sorted
 *     copy is built internally and the collection itself is left as-is.
 * @param sorted {@code true} if the caller guarantees {@code utf8Strings} is already sorted,
 *     {@code false} to have this method sort a copy of the input before building.
 * @return An {@link Automaton} accepting all input strings. The resulting automaton is binary
 *     based (UTF-8 encoded byte transition labels).
 */
public static Automaton makeBinaryStringUnion(Collection<BytesRef> utf8Strings, boolean sorted) {
  if (utf8Strings.isEmpty()) {
    return makeEmpty();
  }
  // Only pay for the copy-and-sort when the caller has not vouched for the ordering.
  final Collection<BytesRef> orderedStrings =
      sorted ? utf8Strings : makeStringsSorted(utf8Strings);
  return StringsToAutomaton.build(orderedStrings, true);
}

/**
* Returns a new (deterministic and minimal) automaton that accepts the union of the given
* collection of {@link BytesRef}s representing UTF-8 encoded strings. The resulting automaton
Expand All @@ -597,6 +658,15 @@ public static Automaton makeBinaryStringUnion(Collection<BytesRef> utf8Strings)
if (utf8Strings.isEmpty()) {
return makeEmpty();
} else {
boolean sorted =
utf8Strings.size() == 1
|| ((utf8Strings instanceof SortedSet
&& ((SortedSet<BytesRef>) utf8Strings).comparator() == null)
|| (utf8Strings instanceof TreeSet
&& ((TreeSet<BytesRef>) utf8Strings).comparator() == null));
if (sorted == false) {
return StringsToAutomaton.build(makeStringsSorted(utf8Strings), true);
}
return StringsToAutomaton.build(utf8Strings, true);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@
* functionality through {@link Automata} static methods.
*
* @see Automata#makeStringUnion(Collection)
* @see Automata#makeStringUnion(Collection, boolean)
* @see Automata#makeBinaryStringUnion(Collection)
* @see Automata#makeBinaryStringUnion(Collection, boolean)
* @see Automata#makeStringUnion(BytesRefIterator)
* @see Automata#makeBinaryStringUnion(BytesRefIterator)
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public void testSinglePath() throws IOException {
List<BytesRef> acceptStrings = new ArrayList<>();
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings, false);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand All @@ -48,7 +48,7 @@ public void testParallelPaths() throws IOException {
acceptStrings.add(new BytesRef("123"));
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings, false);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand All @@ -65,7 +65,7 @@ public void testForkedPath() throws IOException {
acceptStrings.add(new BytesRef("ab3"));
acceptStrings.add(new BytesRef("abc"));

Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings);
Automaton flatPathAutomaton = Automata.makeStringUnion(acceptStrings, false);
TokenStream ts = AutomatonToTokenStream.toTokenStream(flatPathAutomaton);
assertTokenStreamContents(
ts,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,6 @@ public void testBiggishAutomaton() {
terms.add(new BytesRef(TestUtil.randomUnicodeString(random())));
}
Collections.sort(terms);
new AutomatonQuery(new Term("foo", "bar"), Automata.makeStringUnion(terms));
new AutomatonQuery(new Term("foo", "bar"), Automata.makeStringUnion(terms, true));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -807,7 +807,7 @@ private Automaton unionTerms(Collection<BytesRef> terms) {
}
List<BytesRef> termsList = new ArrayList<>(terms);
Collections.sort(termsList);
a = Automata.makeStringUnion(termsList);
a = Automata.makeStringUnion(termsList, true);
}

return randomNoOp(a);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ private CompiledAutomaton build(int determinizeWorkLimit, String... strings) {
terms.add(new BytesRef(s));
}
Collections.sort(terms);
final Automaton a = Automata.makeStringUnion(terms);
final Automaton a = Automata.makeStringUnion(terms, true);
return new CompiledAutomaton(a, true, false, false);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public void testStringUnion() {
}

Collections.sort(strings);
Automaton union = Automata.makeStringUnion(strings);
Automaton union = Automata.makeStringUnion(strings, true);
assertTrue(union.isDeterministic());
assertFalse(Operations.hasDeadStatesFromInitial(union));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ public TokenStreamComponents createComponents(String fieldName) {
List<BytesRef> stringsList = new ArrayList<>(strings);
Collections.sort(stringsList);

Automaton a = Automata.makeStringUnion(stringsList);
Automaton a = Automata.makeStringUnion(stringsList, true);

// Translate automaton to query:

Expand Down

0 comments on commit 27b08c4

Please sign in to comment.