From 9fa1e9d6dde7e11168142108b8533deb660bca36 Mon Sep 17 00:00:00 2001 From: Greg Miller <gsmiller@gmail.com> Date: Wed, 10 May 2023 07:00:47 -0700 Subject: [PATCH 1/2] Initial version --- .../apache/lucene/search/AutomatonQuery.java | 38 +++- .../apache/lucene/search/TermInSetQuery.java | 168 +++--------------- 2 files changed, 65 insertions(+), 141 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java index 093dd0f53f91..038159343a13 100644 --- a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java @@ -91,11 +91,47 @@ public AutomatonQuery(final Term term, Automaton automaton, boolean isBinary) { */ public AutomatonQuery( final Term term, Automaton automaton, boolean isBinary, RewriteMethod rewriteMethod) { + this(term, automaton, false, true, isBinary, rewriteMethod); + } + + /** + * Create a new AutomatonQuery from an {@link Automaton}. + * + * @param term Term containing field and possibly some pattern structure. The term text is + * ignored. + * @param automaton Automaton to run, terms that are accepted are considered a match. + * @param isFinite if true, this automaton is already finite + * @param isBinary if true, this automaton is already binary and will not go through the + * UTF32ToUTF8 conversion + */ + public AutomatonQuery( + final Term term, Automaton automaton, boolean isFinite, boolean simplify, boolean isBinary) { + this(term, automaton, isFinite, simplify, isBinary, CONSTANT_SCORE_BLENDED_REWRITE); + } + + /** + * Create a new AutomatonQuery from an {@link Automaton}. + * + * @param term Term containing field and possibly some pattern structure. The term text is + * ignored. + * @param automaton Automaton to run, terms that are accepted are considered a match. + * @param isFinite if true, this automaton is already finite + * @param isBinary if true, this automaton is already binary and will not go through the + * UTF32ToUTF8 conversion + * @param rewriteMethod the rewriteMethod to use to build the final query from the automaton + */ + public AutomatonQuery( + final Term term, + Automaton automaton, + boolean isFinite, + boolean simplify, + boolean isBinary, + RewriteMethod rewriteMethod) { super(term.field(), rewriteMethod); this.term = term; this.automaton = automaton; this.automatonIsBinary = isBinary; - this.compiled = new CompiledAutomaton(automaton, false, true, isBinary); + this.compiled = new CompiledAutomaton(automaton, isFinite, simplify, isBinary); this.ramBytesUsed = BASE_RAM_BYTES + term.ramBytesUsed() + automaton.ramBytesUsed() + compiled.ramBytesUsed(); diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java index 542f94e65f8e..47e61742f99a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java @@ -17,26 +17,20 @@ package org.apache.lucene.search; import java.io.IOException; -import java.io.UncheckedIOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.Collections; +import java.util.List; import java.util.SortedSet; -import org.apache.lucene.index.FilteredTermsEnum; -import org.apache.lucene.index.PrefixCodedTerms; -import org.apache.lucene.index.PrefixCodedTerms.TermIterator; import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.Accountable; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.ByteRunAutomaton; +import org.apache.lucene.util.automaton.FiniteStringsIterator; +import org.apache.lucene.util.fst.Util; /** * Specialization for a disjunction over many terms that, by default, behaves like a {@link @@ -69,29 +63,25 @@ * * <p>NOTE: This query produces scores that are equal to its boost */ -public class TermInSetQuery extends MultiTermQuery implements Accountable { - - private static final long BASE_RAM_BYTES_USED = - RamUsageEstimator.shallowSizeOfInstance(TermInSetQuery.class); - +public class TermInSetQuery extends AutomatonQuery { private final String field; - private final PrefixCodedTerms termData; - private final int termDataHashCode; // cached hashcode of termData + private final int termCount; public TermInSetQuery(String field, Collection<BytesRef> terms) { - this(field, packTerms(field, terms)); + super(new Term(field), toAutomaton(terms), true, false, true); + this.field = field; + this.termCount = terms.size(); } public TermInSetQuery(String field, BytesRef... terms) { - this(field, packTerms(field, Arrays.asList(terms))); + this(field, Arrays.asList(terms)); } /** Creates a new {@link TermInSetQuery} from the given collection of terms. */ public TermInSetQuery(RewriteMethod rewriteMethod, String field, Collection<BytesRef> terms) { - super(field, rewriteMethod); + super(new Term(field), toAutomaton(terms), true, false, true, rewriteMethod); this.field = field; - this.termData = packTerms(field, terms); - termDataHashCode = termData.hashCode(); + this.termCount = terms.size(); } /** Creates a new {@link TermInSetQuery} from the given array of terms. */ @@ -99,14 +89,7 @@ public TermInSetQuery(RewriteMethod rewriteMethod, String field, BytesRef... ter this(rewriteMethod, field, Arrays.asList(terms)); } - private TermInSetQuery(String field, PrefixCodedTerms termData) { - super(field, MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE); - this.field = field; - this.termData = termData; - termDataHashCode = termData.hashCode(); - } - - private static PrefixCodedTerms packTerms(String field, Collection<BytesRef> terms) { + private static Automaton toAutomaton(Collection<BytesRef> terms) { BytesRef[] sortedTerms = terms.toArray(new BytesRef[0]); // already sorted if we are a SortedSet with natural order boolean sorted = @@ -114,7 +97,7 @@ private static PrefixCodedTerms packTerms(String field, Collection<BytesRef> ter if (sorted == false) { ArrayUtil.timSort(sortedTerms); } - PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder(); + List<BytesRef> sortedAndDeduped = new ArrayList<>(terms.size()); BytesRefBuilder previous = null; for (BytesRef term : sortedTerms) { if (previous == null) { @@ -122,136 +105,41 @@ private static PrefixCodedTerms packTerms(String field, Collection<BytesRef> ter } else if (previous.get().equals(term)) { continue; // deduplicate } - builder.add(field, term); + sortedAndDeduped.add(term); previous.copyBytes(term); } - return builder.finish(); + return Automata.makeBinaryStringUnion(sortedAndDeduped); } @Override public long getTermsCount() throws IOException { - return termData.size(); - } - - @Override - public void visit(QueryVisitor visitor) { - if (visitor.acceptField(field) == false) { - return; - } - if (termData.size() == 1) { - visitor.consumeTerms(this, new Term(field, termData.iterator().next())); - } - if (termData.size() > 1) { - visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton); - } - } - - // TODO: This is pretty heavy-weight. If we have TermInSetQuery directly extend AutomatonQuery - // we won't have to do this (see GH#12176). - private ByteRunAutomaton asByteRunAutomaton() { - try { - Automaton a = Automata.makeBinaryStringUnion(termData.iterator()); - return new ByteRunAutomaton(a, true); - } catch (IOException e) { - // Shouldn't happen since termData.iterator() provides an interator implementation that - // never throws: - throw new UncheckedIOException(e); - } - } - - @Override - public boolean equals(Object other) { - return sameClassAs(other) && equalsTo(getClass().cast(other)); - } - - private boolean equalsTo(TermInSetQuery other) { - // no need to check 'field' explicitly since it is encoded in 'termData' - // termData might be heavy to compare so check the hash code first - return termDataHashCode == other.termDataHashCode && termData.equals(other.termData); - } - - @Override - public int hashCode() { - return 31 * classHash() + termDataHashCode; + return termCount; } + /** + * This is semi-expensive as it fully traverses the automaton to create a helpful toString + * representation. + */ @Override public String toString(String defaultField) { StringBuilder builder = new StringBuilder(); builder.append(field); builder.append(":("); - TermIterator iterator = termData.iterator(); boolean first = true; - for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { - if (!first) { + FiniteStringsIterator it = new FiniteStringsIterator(automaton); + BytesRefBuilder scratch = new BytesRefBuilder(); + for (IntsRef term = it.next(); term != null; term = it.next()) { + if (first == false) { builder.append(' '); } first = false; - builder.append(Term.toString(term)); + BytesRef t = Util.toBytesRef(term, scratch); + builder.append(Term.toString(t)); } builder.append(')'); return builder.toString(); } - - @Override - public long ramBytesUsed() { - return BASE_RAM_BYTES_USED + termData.ramBytesUsed(); - } - - @Override - public Collection<Accountable> getChildResources() { - return Collections.emptyList(); - } - - @Override - protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { - return new SetEnum(terms.iterator()); - } - - /** - * Like a baby {@link org.apache.lucene.index.AutomatonTermsEnum}, ping-pong intersects the terms - * dict against our encoded query terms. - */ - private class SetEnum extends FilteredTermsEnum { - private final TermIterator iterator; - private BytesRef seekTerm; - - SetEnum(TermsEnum termsEnum) { - super(termsEnum); - iterator = termData.iterator(); - seekTerm = iterator.next(); - } - - @Override - protected AcceptStatus accept(BytesRef term) throws IOException { - // next() our iterator until it is >= the incoming term - // if it matches exactly, it's a hit, otherwise it's a miss - int cmp = 0; - while (seekTerm != null && (cmp = seekTerm.compareTo(term)) < 0) { - seekTerm = iterator.next(); - } - if (seekTerm == null) { - return AcceptStatus.END; - } else if (cmp == 0) { - return AcceptStatus.YES_AND_SEEK; - } else { - return AcceptStatus.NO_AND_SEEK; - } - } - - @Override - protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException { - // next() our iterator until it is > the currentTerm, must always make progress. - if (currentTerm == null) { - return seekTerm; - } - while (seekTerm != null && seekTerm.compareTo(currentTerm) <= 0) { - seekTerm = iterator.next(); - } - return seekTerm; - } - } } From 74459ee6f5f4bf45f00e6f2f8e2813317eb816b1 Mon Sep 17 00:00:00 2001 From: Greg Miller <gsmiller@gmail.com> Date: Fri, 2 Jun 2023 15:54:14 -0700 Subject: [PATCH 2/2] temporarily remove broken test case --- .../lucene/search/TestTermInSetQuery.java | 46 ------------------- 1 file changed, 46 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java index a62d7f8fc4d9..e78689e1e46b 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java @@ -25,7 +25,6 @@ import java.util.List; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Supplier; import java.util.stream.Collectors; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; @@ -47,7 +46,6 @@ import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.automaton.ByteRunAutomaton; public class TestTermInSetQuery extends LuceneTestCase { @@ -359,48 +357,4 @@ public void testIsConsideredCostlyByQueryCache() throws IOException { // cached after two uses assertTrue(policy.shouldCache(query)); } - - public void testVisitor() { - // singleton reports back to consumeTerms() - TermInSetQuery singleton = new TermInSetQuery("field", newBytesRef("term1")); - singleton.visit( - new QueryVisitor() { - @Override - public void consumeTerms(Query query, Term... terms) { - assertEquals(1, terms.length); - assertEquals(new Term("field", newBytesRef("term1")), terms[0]); - } - - @Override - public void consumeTermsMatching( - Query query, String field, Supplier<ByteRunAutomaton> automaton) { - fail("Singleton TermInSetQuery should not try to build ByteRunAutomaton"); - } - }); - - // multiple values built into automaton - List<BytesRef> terms = new ArrayList<>(); - for (int i = 0; i < 100; i++) { - terms.add(newBytesRef("term" + i)); - } - TermInSetQuery t = new TermInSetQuery("field", terms); - t.visit( - new QueryVisitor() { - @Override - public void consumeTerms(Query query, Term... terms) { - fail("TermInSetQuery with multiple terms should build automaton"); - } - - @Override - public void consumeTermsMatching( - Query query, String field, Supplier<ByteRunAutomaton> automaton) { - ByteRunAutomaton a = automaton.get(); - BytesRef test = newBytesRef("nonmatching"); - assertFalse(a.run(test.bytes, test.offset, test.length)); - for (BytesRef term : terms) { - assertTrue(a.run(term.bytes, term.offset, term.length)); - } - } - }); - } }