diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4df94417bdcd..c97d0ade6481 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -18,7 +18,9 @@ New Features Improvements --------------------- -GITHUB#12245: Add support for Score Mode to `ToParentBlockJoinQuery` explain. (Marcus Eagan via Mikhail Khludnev) +* GITHUB#12245: Add support for Score Mode to `ToParentBlockJoinQuery` explain. (Marcus Eagan via Mikhail Khludnev) + +* GITHUB#12305: Minor cleanup and improvements to DaciukMihovAutomatonBuilder. (Greg Miller) Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java index 94002b04a40b..2fe13101168e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java @@ -24,7 +24,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.CharsRefBuilder; /** * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm @@ -179,10 +179,10 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) { private HashMap stateRegistry = new HashMap<>(); /** Root automaton state. */ - private State root = new State(); + private final State root = new State(); /** Previous sequence added to the automaton in {@link #add(CharsRef)}. */ - private CharsRef previous; + private CharsRefBuilder previous; /** A comparator used for enforcing sorted UTF8 order, used in assertions only. */ @SuppressWarnings("deprecation") @@ -192,23 +192,33 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) { * Add another character sequence to this automaton. The sequence must be lexicographically larger * or equal compared to any previous sequences added to this automaton (the input must be sorted). */ - public void add(CharsRef current) { + private void add(CharsRef current) { if (current.length > MAX_TERM_LENGTH) { throw new IllegalArgumentException( "This builder doesn't allow terms that are larger than 1,000 characters, got " + current); } assert stateRegistry != null : "Automaton already built."; - assert previous == null || comparator.compare(previous, current) <= 0 + assert previous == null || comparator.compare(previous.get(), current) <= 0 : "Input must be in sorted UTF-8 order: " + previous + " >= " + current; assert setPrevious(current); // Descend in the automaton (find matching prefix). int pos = 0, max = current.length(); - State next, state = root; - while (pos < max && (next = state.lastChild(Character.codePointAt(current, pos))) != null) { + State state = root; + for (; ; ) { + assert pos <= max; + if (pos == max) { + break; + } + + int codePoint = Character.codePointAt(current, pos); + State next = state.lastChild(codePoint); + if (next == null) { + break; + } + state = next; - // todo, optimize me - pos += Character.charCount(Character.codePointAt(current, pos)); + pos += Character.charCount(codePoint); } if (state.hasChildren()) replaceOrRegister(state); @@ -222,7 +232,7 @@ public void add(CharsRef current) { * * @return Root automaton state. */ - public State complete() { + private State complete() { if (this.stateRegistry == null) throw new IllegalStateException(); if (root.hasChildren()) replaceOrRegister(root); @@ -260,27 +270,24 @@ private static int convert( public static Automaton build(Collection input) { final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder(); - char[] chars = new char[0]; - CharsRef ref = new CharsRef(); + CharsRefBuilder current = new CharsRefBuilder(); for (BytesRef b : input) { - chars = ArrayUtil.grow(chars, b.length); - final int len = UnicodeUtil.UTF8toUTF16(b, chars); - ref.chars = chars; - ref.length = len; - builder.add(ref); + current.copyUTF8Bytes(b); + builder.add(current.get()); } Automaton.Builder a = new Automaton.Builder(); - convert(a, builder.complete(), new IdentityHashMap()); + convert(a, builder.complete(), new IdentityHashMap<>()); return a.finish(); } /** Copy current into an internal buffer. */ private boolean setPrevious(CharsRef current) { - // don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277 - // still, called only from assert - previous = CharsRef.deepCopyOf(current); + if (previous == null) { + previous = new CharsRefBuilder(); + } + previous.copyChars(current); return true; }