From c870b8b06c0f342e22a61744b5a85b86cd11e7bc Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Mon, 8 May 2023 16:35:42 -0700 Subject: [PATCH 1/4] Expose iterator over query terms in TermInSetQuery --- lucene/CHANGES.txt | 2 +- .../org/apache/lucene/search/TermInSetQuery.java | 6 ++++++ .../apache/lucene/search/TestTermInSetQuery.java | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index db018fb21446..e50f5d615add 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -110,7 +110,7 @@ Other API Changes --------------------- -(No changes) +* GITHUB#xx: Expose iterator over query terms in TermInSetQuery (Greg Miller) New Features --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java index b989ac07849f..fba4ec795d51 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java @@ -34,6 +34,7 @@ import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; @@ -137,6 +138,11 @@ public long getTermsCount() throws IOException { return termData.size(); } + /** Returns an iterator of the provided query terms */ + public BytesRefIterator getQueryTerms() { + return termData.iterator(); + } + @Override public void visit(QueryVisitor visitor) { if (visitor.acceptField(field) == false) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java index a62d7f8fc4d9..d313112800b5 100644 --- 
a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java @@ -46,6 +46,7 @@ import org.apache.lucene.tests.util.RamUsageTester; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.automaton.ByteRunAutomaton; @@ -403,4 +404,18 @@ public void consumeTermsMatching( } }); } + + public void testQueryTermIteration() throws Exception { + Set<BytesRef> terms = new HashSet<>(); + for (int i = 0; i < 100; i++) { + terms.add(new BytesRef(TestUtil.randomAnalysisString(random(), 10, true))); + } + List<BytesRef> expected = terms.stream().sorted().toList(); + TermInSetQuery q = new TermInSetQuery("field", expected); + BytesRefIterator it = q.getQueryTerms(); + for (BytesRef e : expected) { + assertEquals(e, it.next()); + } + assertNull(it.next()); + } } From 8b745552dc228113cf69d264ff3b96637dfe4aec Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Mon, 8 May 2023 16:43:53 -0700 Subject: [PATCH 2/4] changes --- lucene/CHANGES.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e50f5d615add..5cfb167c2a03 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -110,7 +110,7 @@ Other API Changes --------------------- -* GITHUB#xx: Expose iterator over query terms in TermInSetQuery (Greg Miller) +* GITHUB#12280: Expose iterator over query terms in TermInSetQuery (Greg Miller) New Features --------------------- From 49eb9f11fd86e2771f202c232506e78712421bc0 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Tue, 9 May 2023 09:26:16 -0700 Subject: [PATCH 3/4] nocommit: add demo sandbox query to show extension use-case --- .../sandbox/search/PKTermInSetQuery.java | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 
lucene/sandbox/src/java/org/apache/lucene/sandbox/search/PKTermInSetQuery.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/PKTermInSetQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/PKTermInSetQuery.java new file mode 100644 index 000000000000..11542a754e79 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/PKTermInSetQuery.java @@ -0,0 +1,117 @@ +package org.apache.lucene.sandbox.search; + +import java.io.IOException; +import java.util.Collection; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.TermInSetQuery; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; + +/** + * {@link TermInSetQuery} optimized for a primary key-like field. + * + *

<p>Relies on {@link TermsEnum#seekExact(BytesRef)} instead of {@link + * TermsEnum#seekCeil(BytesRef)} to produce a terms iterator, which is compatible with {@code + * BloomFilteringPostingsFormat}. + */ +public class PKTermInSetQuery extends TermInSetQuery { + public PKTermInSetQuery(String field, Collection<BytesRef> terms) { + super(field, terms); + } + + public PKTermInSetQuery(String field, BytesRef... terms) { + super(field, terms); + } + + public PKTermInSetQuery(RewriteMethod rewriteMethod, String field, Collection<BytesRef> terms) { + super(rewriteMethod, field, terms); + } + + public PKTermInSetQuery(RewriteMethod rewriteMethod, String field, BytesRef... terms) { + super(rewriteMethod, field, terms); + } + + @Override + protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { + final TermsEnum tEnum = terms.iterator(); + final BytesRefIterator queryTerms = getQueryTerms(); + + return new TermsEnum() { + @Override + public BytesRef next() throws IOException { + BytesRef nextTerm; + while ((nextTerm = queryTerms.next()) != null) { + if (tEnum.seekExact(nextTerm)) { + break; + } + } + return nextTerm; + } + + @Override + public AttributeSource attributes() { + return tEnum.attributes(); + } + + @Override + public BytesRef term() throws IOException { + return tEnum.term(); + } + + @Override + public long ord() throws IOException { + return tEnum.ord(); + } + + @Override + public int docFreq() throws IOException { + return tEnum.docFreq(); + } + + @Override + public long totalTermFreq() throws IOException { + return tEnum.totalTermFreq(); + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + return tEnum.postings(reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + return tEnum.impacts(flags); + } + + @Override + public TermState termState() throws IOException { + return tEnum.termState(); + } + + @Override + public boolean seekExact(BytesRef text) throws 
IOException { + throw new UnsupportedOperationException(); + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(BytesRef term, TermState state) throws IOException { + throw new UnsupportedOperationException(); + } + }; + } +} From 8af5cfe94eb17d88a423e5aa385caf8cab167de2 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Tue, 9 May 2023 09:31:58 -0700 Subject: [PATCH 4/4] add license header to make checks happy --- .../lucene/sandbox/search/PKTermInSetQuery.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/PKTermInSetQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/PKTermInSetQuery.java index 11542a754e79..4cc10c5569cb 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/PKTermInSetQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/PKTermInSetQuery.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.search; import java.io.IOException;