From de63512a1d369dc5e0ca00259290eae3871a2a98 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Wed, 17 May 2023 13:34:51 -0700 Subject: [PATCH 1/4] Minor cleanup and improvements to DaciukMihovAutomatonBuilder --- lucene/CHANGES.txt | 4 +++- .../DaciukMihovAutomatonBuilder.java | 24 +++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 57c255853db9..1f592a16cf18 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -123,7 +123,9 @@ New Features Improvements --------------------- -GITHUB#12245: Add support for Score Mode to `ToParentBlockJoinQuery` explain. (Marcus Eagan via Mikhail Khludnev) +* GITHUB#12245: Add support for Score Mode to `ToParentBlockJoinQuery` explain. (Marcus Eagan via Mikhail Khludnev) + +* GITHUB#xx: Minor cleanup and improvements to DaciukMihovAutomatonBuilder. (Greg Miller) Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java index 94002b04a40b..c68a604959de 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java @@ -179,7 +179,7 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) { private HashMap stateRegistry = new HashMap<>(); /** Root automaton state. */ - private State root = new State(); + private final State root = new State(); /** Previous sequence added to the automaton in {@link #add(CharsRef)}. */ private CharsRef previous; @@ -192,7 +192,7 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) { * Add another character sequence to this automaton. The sequence must be lexicographically larger * or equal compared to any previous sequences added to this automaton (the input must be sorted). */ - public void add(CharsRef current) { + private void add(CharsRef current) { if (current.length > MAX_TERM_LENGTH) { throw new IllegalArgumentException( "This builder doesn't allow terms that are larger than 1,000 characters, got " + current); @@ -205,10 +205,20 @@ public void add(CharsRef current) { // Descend in the automaton (find matching prefix). int pos = 0, max = current.length(); State next, state = root; - while (pos < max && (next = state.lastChild(Character.codePointAt(current, pos))) != null) { + for (; ; ) { + assert pos <= max; + if (pos == max) { + break; + } + + int codePoint = Character.codePointAt(current, pos); + next = state.lastChild(codePoint); + if (next == null) { + break; + } + state = next; - // todo, optimize me - pos += Character.charCount(Character.codePointAt(current, pos)); + pos += Character.charCount(codePoint); } if (state.hasChildren()) replaceOrRegister(state); @@ -222,7 +232,7 @@ public void add(CharsRef current) { * * @return Root automaton state. */ - public State complete() { + private State complete() { if (this.stateRegistry == null) throw new IllegalStateException(); if (root.hasChildren()) replaceOrRegister(root); @@ -271,7 +281,7 @@ public static Automaton build(Collection input) { } Automaton.Builder a = new Automaton.Builder(); - convert(a, builder.complete(), new IdentityHashMap()); + convert(a, builder.complete(), new IdentityHashMap<>()); return a.finish(); } From 4eebed9f224823fd5cc58db81c1f6e03845cbdd5 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Wed, 17 May 2023 14:51:37 -0700 Subject: [PATCH 2/4] changes entry and minor tweak --- lucene/CHANGES.txt | 2 +- .../lucene/util/automaton/DaciukMihovAutomatonBuilder.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1f592a16cf18..c022616c709a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -125,7 +125,7 @@ Improvements * GITHUB#12245: Add support for Score Mode to `ToParentBlockJoinQuery` explain. (Marcus Eagan via Mikhail Khludnev) -* GITHUB#xx: Minor cleanup and improvements to DaciukMihovAutomatonBuilder. (Greg Miller) +* GITHUB#12305: Minor cleanup and improvements to DaciukMihovAutomatonBuilder. (Greg Miller) Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java index c68a604959de..46eabe61f11a 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java @@ -204,7 +204,7 @@ private void add(CharsRef current) { // Descend in the automaton (find matching prefix). int pos = 0, max = current.length(); - State next, state = root; + State state = root; for (; ; ) { assert pos <= max; if (pos == max) { @@ -212,7 +212,7 @@ private void add(CharsRef current) { } int codePoint = Character.codePointAt(current, pos); - next = state.lastChild(codePoint); + State next = state.lastChild(codePoint); if (next == null) { break; } From db37893bf0bafedee0bf5be560665255b6adc4aa Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Wed, 17 May 2023 16:58:37 -0700 Subject: [PATCH 3/4] make use of CharsRefBuilder to clean up code and avoid new object creation for asserts --- .../DaciukMihovAutomatonBuilder.java | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java index 46eabe61f11a..a985c790b08b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java @@ -16,15 +16,16 @@ */ package org.apache.lucene.util.automaton; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CharsRefBuilder; + import java.util.Arrays; import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.IdentityHashMap; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.UnicodeUtil; /** * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm @@ -182,7 +183,7 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) { private final State root = new State(); /** Previous sequence added to the automaton in {@link #add(CharsRef)}. */ - private CharsRef previous; + private CharsRefBuilder previous; /** A comparator used for enforcing sorted UTF8 order, used in assertions only. */ @SuppressWarnings("deprecation") @@ -198,7 +199,7 @@ private void add(CharsRef current) { "This builder doesn't allow terms that are larger than 1,000 characters, got " + current); } assert stateRegistry != null : "Automaton already built."; - assert previous == null || comparator.compare(previous, current) <= 0 + assert previous == null || comparator.compare(previous.get(), current) <= 0 : "Input must be in sorted UTF-8 order: " + previous + " >= " + current; assert setPrevious(current); @@ -270,14 +271,10 @@ private static int convert( public static Automaton build(Collection input) { final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder(); - char[] chars = new char[0]; - CharsRef ref = new CharsRef(); + CharsRefBuilder current = new CharsRefBuilder(); for (BytesRef b : input) { - chars = ArrayUtil.grow(chars, b.length); - final int len = UnicodeUtil.UTF8toUTF16(b, chars); - ref.chars = chars; - ref.length = len; - builder.add(ref); + current.copyUTF8Bytes(b); + builder.add(current.get()); } Automaton.Builder a = new Automaton.Builder(); @@ -288,9 +285,10 @@ public static Automaton build(Collection input) { /** Copy current into an internal buffer. */ private boolean setPrevious(CharsRef current) { - // don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277 - // still, called only from assert - previous = CharsRef.deepCopyOf(current); + if (previous == null) { + previous = new CharsRefBuilder(); + } + previous.copyChars(current); return true; } From 3c5602336295431333a37ea2623467f21576f0a7 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Wed, 17 May 2023 17:01:26 -0700 Subject: [PATCH 4/4] spotless --- .../util/automaton/DaciukMihovAutomatonBuilder.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java index a985c790b08b..2fe13101168e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java @@ -16,16 +16,15 @@ */ package org.apache.lucene.util.automaton; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.CharsRefBuilder; - import java.util.Arrays; import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.IdentityHashMap; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CharsRefBuilder; /** * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm