Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose iterator over query terms in TermInSetQuery #12280

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ Other

API Changes
---------------------
(No changes)
* GITHUB#12280: Expose iterator over query terms in TermInSetQuery (Greg Miller)

New Features
---------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
Expand Down Expand Up @@ -137,6 +138,11 @@ public long getTermsCount() throws IOException {
return termData.size();
}

/**
 * Exposes the terms supplied to this query for inspection.
 *
 * @return a {@link BytesRefIterator} over the query terms, obtained from {@code termData}
 */
public BytesRefIterator getQueryTerms() {
  final BytesRefIterator queryTermIterator = termData.iterator();
  return queryTermIterator;
}

@Override
public void visit(QueryVisitor visitor) {
if (visitor.acceptField(field) == false) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.apache.lucene.tests.util.RamUsageTester;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.ByteRunAutomaton;

Expand Down Expand Up @@ -403,4 +404,18 @@ public void consumeTermsMatching(
}
});
}

/**
 * Checks that {@code getQueryTerms()} yields every term given to the query, in the same order as
 * the sorted input list, and then signals exhaustion by returning {@code null}.
 */
public void testQueryTermIteration() throws Exception {
  // A set is used so that any duplicate random strings collapse into a single term.
  final Set<BytesRef> uniqueTerms = new HashSet<>();
  for (int round = 0; round < 100; round++) {
    final String randomText = TestUtil.randomAnalysisString(random(), 10, true);
    uniqueTerms.add(new BytesRef(randomText));
  }

  final List<BytesRef> sortedTerms = uniqueTerms.stream().sorted().toList();
  final TermInSetQuery query = new TermInSetQuery("field", sortedTerms);
  final BytesRefIterator termIterator = query.getQueryTerms();

  // Walk the expected list by index and compare against the iterator step by step.
  for (int idx = 0; idx < sortedTerms.size(); idx++) {
    assertEquals(sortedTerms.get(idx), termIterator.next());
  }

  // Once all expected terms have been consumed the iterator must be exhausted.
  assertNull(termIterator.next());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
package org.apache.lucene.sandbox.search;

import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;

/**
* {@link TermInSetQuery} optimized for a primary key-like field.
*
* <p>Relies on {@link TermsEnum#seekExact(BytesRef)} instead of {@link
* TermsEnum#seekCeil(BytesRef)} to produce a terms iterator, which is compatible with {@code
* BloomFilteringPostingsFormat}.
*/
public class PKTermInSetQuery extends TermInSetQuery {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This class is for demo purposes only. I'm not suggesting we merge it as part of this PR. I only want to demonstrate how a class might leverage getQueryTerms.

/**
 * Creates a query on {@code field} for the given collection of {@code terms}; both arguments are
 * passed unchanged to the {@link TermInSetQuery} constructor.
 *
 * @param field name of the field to query
 * @param terms terms to match
 */
public PKTermInSetQuery(String field, Collection<BytesRef> terms) {
super(field, terms);
}

/**
 * Varargs variant: creates a query on {@code field} for the given {@code terms}; both arguments
 * are passed unchanged to the {@link TermInSetQuery} constructor.
 *
 * @param field name of the field to query
 * @param terms terms to match
 */
public PKTermInSetQuery(String field, BytesRef... terms) {
super(field, terms);
}

/**
 * Creates a query on {@code field} for the given collection of {@code terms} using an explicit
 * {@code rewriteMethod}; all arguments are passed unchanged to the {@link TermInSetQuery}
 * constructor.
 *
 * @param rewriteMethod rewrite method forwarded to the superclass
 * @param field name of the field to query
 * @param terms terms to match
 */
public PKTermInSetQuery(RewriteMethod rewriteMethod, String field, Collection<BytesRef> terms) {
super(rewriteMethod, field, terms);
}

/**
 * Varargs variant: creates a query on {@code field} for the given {@code terms} using an explicit
 * {@code rewriteMethod}; all arguments are passed unchanged to the {@link TermInSetQuery}
 * constructor.
 *
 * @param rewriteMethod rewrite method forwarded to the superclass
 * @param field name of the field to query
 * @param terms terms to match
 */
public PKTermInSetQuery(RewriteMethod rewriteMethod, String field, BytesRef... terms) {
super(rewriteMethod, field, terms);
}

/**
 * Builds a {@link TermsEnum} that walks this query's terms (from {@code getQueryTerms()}) and
 * probes the supplied {@code terms} with {@link TermsEnum#seekExact(BytesRef)}.
 *
 * <p>Only forward iteration through {@code next()} is supported: every seek method of the returned
 * enum throws {@link UnsupportedOperationException}. All remaining accessors delegate to the enum
 * obtained from {@code terms.iterator()}. The {@code atts} argument is not used.
 */
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  final BytesRefIterator candidates = getQueryTerms();
  final TermsEnum delegate = terms.iterator();

  return new TermsEnum() {

    // ---- iteration -------------------------------------------------------------------------

    /**
     * Advances through the query terms and returns the first one for which the delegate's
     * {@code seekExact} reports a hit, or {@code null} once the query terms are exhausted.
     */
    @Override
    public BytesRef next() throws IOException {
      for (BytesRef candidate = candidates.next();
          candidate != null;
          candidate = candidates.next()) {
        if (tryPosition(candidate)) {
          return candidate;
        }
      }
      return null;
    }

    /** Positions the delegate on {@code candidate}; returns whether the exact seek succeeded. */
    private boolean tryPosition(BytesRef candidate) throws IOException {
      return delegate.seekExact(candidate);
    }

    // ---- seeking is not supported on this enum ---------------------------------------------

    @Override
    public boolean seekExact(BytesRef text) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public void seekExact(long ord) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public void seekExact(BytesRef term, TermState state) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public SeekStatus seekCeil(BytesRef text) throws IOException {
      throw new UnsupportedOperationException();
    }

    // ---- everything else is answered by the delegate ---------------------------------------

    @Override
    public BytesRef term() throws IOException {
      return delegate.term();
    }

    @Override
    public long ord() throws IOException {
      return delegate.ord();
    }

    @Override
    public int docFreq() throws IOException {
      return delegate.docFreq();
    }

    @Override
    public long totalTermFreq() throws IOException {
      return delegate.totalTermFreq();
    }

    @Override
    public TermState termState() throws IOException {
      return delegate.termState();
    }

    @Override
    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
      return delegate.postings(reuse, flags);
    }

    @Override
    public ImpactsEnum impacts(int flags) throws IOException {
      return delegate.impacts(flags);
    }

    @Override
    public AttributeSource attributes() {
      return delegate.attributes();
    }
  };
}
}