Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT] GH#12176: TermInSetQuery extends AutomatonQuery #12312

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,47 @@ public AutomatonQuery(final Term term, Automaton automaton, boolean isBinary) {
*/
public AutomatonQuery(
final Term term, Automaton automaton, boolean isBinary, RewriteMethod rewriteMethod) {
this(term, automaton, false, true, isBinary, rewriteMethod);
}

/**
* Create a new AutomatonQuery from an {@link Automaton}.
*
* @param term Term containing field and possibly some pattern structure. The term text is
* ignored.
* @param automaton Automaton to run, terms that are accepted are considered a match.
* @param isFinite if true, this automaton is already finite
* @param isBinary if true, this automaton is already binary and will not go through the
* UTF32ToUTF8 conversion
*/
public AutomatonQuery(
final Term term, Automaton automaton, boolean isFinite, boolean simplify, boolean isBinary) {
this(term, automaton, isFinite, simplify, isBinary, CONSTANT_SCORE_BLENDED_REWRITE);
}

/**
* Create a new AutomatonQuery from an {@link Automaton}.
*
* @param term Term containing field and possibly some pattern structure. The term text is
* ignored.
* @param automaton Automaton to run, terms that are accepted are considered a match.
* @param isFinite if true, this automaton is already finite
* @param isBinary if true, this automaton is already binary and will not go through the
* UTF32ToUTF8 conversion
* @param rewriteMethod the rewriteMethod to use to build the final query from the automaton
*/
public AutomatonQuery(
final Term term,
Automaton automaton,
boolean isFinite,
boolean simplify,
boolean isBinary,
RewriteMethod rewriteMethod) {
super(term.field(), rewriteMethod);
this.term = term;
this.automaton = automaton;
this.automatonIsBinary = isBinary;
this.compiled = new CompiledAutomaton(automaton, false, true, isBinary);
this.compiled = new CompiledAutomaton(automaton, isFinite, simplify, isBinary);

this.ramBytesUsed =
BASE_RAM_BYTES + term.ramBytesUsed() + automaton.ramBytesUsed() + compiled.ramBytesUsed();
Expand Down
168 changes: 28 additions & 140 deletions lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,20 @@
package org.apache.lucene.search;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.SortedSet;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.fst.Util;

/**
* Specialization for a disjunction over many terms that, by default, behaves like a {@link
Expand Down Expand Up @@ -69,189 +63,83 @@
*
* <p>NOTE: This query produces scores that are equal to its boost
*/
public class TermInSetQuery extends MultiTermQuery implements Accountable {

private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(TermInSetQuery.class);

public class TermInSetQuery extends AutomatonQuery {
private final String field;
private final PrefixCodedTerms termData;
private final int termDataHashCode; // cached hashcode of termData
private final int termCount;

public TermInSetQuery(String field, Collection<BytesRef> terms) {
this(field, packTerms(field, terms));
super(new Term(field), toAutomaton(terms), true, false, true);
this.field = field;
this.termCount = terms.size();
}

public TermInSetQuery(String field, BytesRef... terms) {
this(field, packTerms(field, Arrays.asList(terms)));
this(field, Arrays.asList(terms));
}

/** Creates a new {@link TermInSetQuery} from the given collection of terms. */
public TermInSetQuery(RewriteMethod rewriteMethod, String field, Collection<BytesRef> terms) {
super(field, rewriteMethod);
super(new Term(field), toAutomaton(terms), true, false, true, rewriteMethod);
this.field = field;
this.termData = packTerms(field, terms);
termDataHashCode = termData.hashCode();
this.termCount = terms.size();
}

/** Creates a new {@link TermInSetQuery} from the given array of terms. */
public TermInSetQuery(RewriteMethod rewriteMethod, String field, BytesRef... terms) {
this(rewriteMethod, field, Arrays.asList(terms));
}

private TermInSetQuery(String field, PrefixCodedTerms termData) {
super(field, MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE);
this.field = field;
this.termData = termData;
termDataHashCode = termData.hashCode();
}

private static PrefixCodedTerms packTerms(String field, Collection<BytesRef> terms) {
private static Automaton toAutomaton(Collection<BytesRef> terms) {
BytesRef[] sortedTerms = terms.toArray(new BytesRef[0]);
// already sorted if we are a SortedSet with natural order
boolean sorted =
terms instanceof SortedSet && ((SortedSet<BytesRef>) terms).comparator() == null;
if (sorted == false) {
ArrayUtil.timSort(sortedTerms);
}
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
List<BytesRef> sortedAndDeduped = new ArrayList<>(terms.size());
BytesRefBuilder previous = null;
for (BytesRef term : sortedTerms) {
if (previous == null) {
previous = new BytesRefBuilder();
} else if (previous.get().equals(term)) {
continue; // deduplicate
}
builder.add(field, term);
sortedAndDeduped.add(term);
previous.copyBytes(term);
}

return builder.finish();
return Automata.makeBinaryStringUnion(sortedAndDeduped);
}

@Override
public long getTermsCount() throws IOException {
return termData.size();
}

@Override
public void visit(QueryVisitor visitor) {
if (visitor.acceptField(field) == false) {
return;
}
if (termData.size() == 1) {
visitor.consumeTerms(this, new Term(field, termData.iterator().next()));
}
if (termData.size() > 1) {
visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton);
}
}

// TODO: This is pretty heavy-weight. If we have TermInSetQuery directly extend AutomatonQuery
// we won't have to do this (see GH#12176).
private ByteRunAutomaton asByteRunAutomaton() {
try {
Automaton a = Automata.makeBinaryStringUnion(termData.iterator());
return new ByteRunAutomaton(a, true);
} catch (IOException e) {
// Shouldn't happen since termData.iterator() provides an interator implementation that
// never throws:
throw new UncheckedIOException(e);
}
}

@Override
public boolean equals(Object other) {
return sameClassAs(other) && equalsTo(getClass().cast(other));
}

private boolean equalsTo(TermInSetQuery other) {
// no need to check 'field' explicitly since it is encoded in 'termData'
// termData might be heavy to compare so check the hash code first
return termDataHashCode == other.termDataHashCode && termData.equals(other.termData);
}

@Override
public int hashCode() {
return 31 * classHash() + termDataHashCode;
return termCount;
}

/**
* This is semi-expensive as it fully traverses the automaton to create a helpful toString
* representation.
*/
@Override
public String toString(String defaultField) {
StringBuilder builder = new StringBuilder();
builder.append(field);
builder.append(":(");

TermIterator iterator = termData.iterator();
boolean first = true;
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
if (!first) {
FiniteStringsIterator it = new FiniteStringsIterator(automaton);
BytesRefBuilder scratch = new BytesRefBuilder();
for (IntsRef term = it.next(); term != null; term = it.next()) {
if (first == false) {
builder.append(' ');
}
first = false;
builder.append(Term.toString(term));
BytesRef t = Util.toBytesRef(term, scratch);
builder.append(Term.toString(t));
}
builder.append(')');

return builder.toString();
}

@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED + termData.ramBytesUsed();
}

@Override
public Collection<Accountable> getChildResources() {
return Collections.emptyList();
}

@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
return new SetEnum(terms.iterator());
}

/**
* Like a baby {@link org.apache.lucene.index.AutomatonTermsEnum}, ping-pong intersects the terms
* dict against our encoded query terms.
*/
private class SetEnum extends FilteredTermsEnum {
private final TermIterator iterator;
private BytesRef seekTerm;

SetEnum(TermsEnum termsEnum) {
super(termsEnum);
iterator = termData.iterator();
seekTerm = iterator.next();
}

@Override
protected AcceptStatus accept(BytesRef term) throws IOException {
// next() our iterator until it is >= the incoming term
// if it matches exactly, it's a hit, otherwise it's a miss
int cmp = 0;
while (seekTerm != null && (cmp = seekTerm.compareTo(term)) < 0) {
seekTerm = iterator.next();
}
if (seekTerm == null) {
return AcceptStatus.END;
} else if (cmp == 0) {
return AcceptStatus.YES_AND_SEEK;
} else {
return AcceptStatus.NO_AND_SEEK;
}
}

@Override
protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException {
// next() our iterator until it is > the currentTerm, must always make progress.
if (currentTerm == null) {
return seekTerm;
}
while (seekTerm != null && seekTerm.compareTo(currentTerm) <= 0) {
seekTerm = iterator.next();
}
return seekTerm;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
Expand All @@ -47,7 +46,6 @@
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.ByteRunAutomaton;

public class TestTermInSetQuery extends LuceneTestCase {

Expand Down Expand Up @@ -359,48 +357,4 @@ public void testIsConsideredCostlyByQueryCache() throws IOException {
// cached after two uses
assertTrue(policy.shouldCache(query));
}

public void testVisitor() {
// singleton reports back to consumeTerms()
TermInSetQuery singleton = new TermInSetQuery("field", newBytesRef("term1"));
singleton.visit(
new QueryVisitor() {
@Override
public void consumeTerms(Query query, Term... terms) {
assertEquals(1, terms.length);
assertEquals(new Term("field", newBytesRef("term1")), terms[0]);
}

@Override
public void consumeTermsMatching(
Query query, String field, Supplier<ByteRunAutomaton> automaton) {
fail("Singleton TermInSetQuery should not try to build ByteRunAutomaton");
}
});

// multiple values built into automaton
List<BytesRef> terms = new ArrayList<>();
for (int i = 0; i < 100; i++) {
terms.add(newBytesRef("term" + i));
}
TermInSetQuery t = new TermInSetQuery("field", terms);
t.visit(
new QueryVisitor() {
@Override
public void consumeTerms(Query query, Term... terms) {
fail("TermInSetQuery with multiple terms should build automaton");
}

@Override
public void consumeTermsMatching(
Query query, String field, Supplier<ByteRunAutomaton> automaton) {
ByteRunAutomaton a = automaton.get();
BytesRef test = newBytesRef("nonmatching");
assertFalse(a.run(test.bytes, test.offset, test.length));
for (BytesRef term : terms) {
assertTrue(a.run(term.bytes, term.offset, term.length));
}
}
});
}
}