Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LUCENE-10002: Deprecate IndexSearch#search(Query, Collector) in favor of IndexSearcher#search(Query, CollectorManager) - TopFieldCollectorManager & TopScoreDocCollectorManager #240

Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,13 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TopFieldCollectorManager;
import org.apache.lucene.search.TopScoreDocCollectorManager;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;

Expand Down Expand Up @@ -110,17 +109,20 @@ public int doLogic() throws Exception {
// the IndexSearcher search methods that take
// Weight public again, we can go back to
// pulling the Weight ourselves:
TopFieldCollector collector =
TopFieldCollector.create(sort, numHits, withTotalHits() ? Integer.MAX_VALUE : 1);
searcher.search(q, collector);
hits = collector.topDocs();
int totalHitsThreshold = withTotalHits() ? Integer.MAX_VALUE : 1;
TopFieldCollectorManager collectorManager =
new TopFieldCollectorManager(
sort, numHits, null, totalHitsThreshold, searcher.getSlices().length > 1);
hits = searcher.search(q, collectorManager);
} else {
hits = searcher.search(q, numHits);
}
} else {
Collector collector = createCollector();
searcher.search(q, collector);
// hits = collector.topDocs();
int totalHitsThreshold = withTotalHits() ? Integer.MAX_VALUE : 1;
TopScoreDocCollectorManager collectorManager =
new TopScoreDocCollectorManager(
numHits(), null, totalHitsThreshold, searcher.getSlices().length > 1);
hits = searcher.search(q, collectorManager);
}

if (hits != null) {
Expand Down Expand Up @@ -182,10 +184,6 @@ protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws
return res;
}

protected Collector createCollector() throws Exception {
return TopScoreDocCollector.create(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
}

protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {
return storedFields.document(id);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.TopScoreDocCollector;

/** Does search w/ a custom collector */
public class SearchWithCollectorTask extends SearchTask {
Expand All @@ -45,20 +43,6 @@ public boolean withCollector() {
return true;
}

@Override
protected Collector createCollector() throws Exception {
zacharymorn marked this conversation as resolved.
Show resolved Hide resolved
Collector collector = null;
if (clnName.equalsIgnoreCase("topScoreDoc") == true) {
collector = TopScoreDocCollector.create(numHits(), Integer.MAX_VALUE);
} else if (clnName.length() > 0) {
collector = Class.forName(clnName).asSubclass(Collector.class).getConstructor().newInstance();

} else {
collector = super.createCollector();
}
return collector;
}

@Override
public QueryMaker getQueryMaker() {
return getRunData().getQueryMaker(this);
Expand Down
73 changes: 13 additions & 60 deletions lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
Expand Down Expand Up @@ -62,9 +61,9 @@
* match lots of documents, counting the number of hits may take much longer than computing the top
* hits so this trade-off allows to get some minimal information about the hit count without slowing
* down search too much. The {@link TopDocs#scoreDocs} array is always accurate however. If this
* behavior doesn't suit your needs, you should create collectors manually with either {@link
* TopScoreDocCollector#create} or {@link TopFieldCollector#create} and call {@link #search(Query,
* Collector)}.
* behavior doesn't suit your needs, you should create collectorManagers manually with either {@link
* TopScoreDocCollectorManager} or {@link TopFieldCollectorManager} and call {@link #search(Query,
* CollectorManager)}.
*
* <p><a id="thread-safety"></a>
*
Expand Down Expand Up @@ -455,35 +454,10 @@ public TopDocs searchAfter(ScoreDoc after, Query query, int numHits) throws IOEx
}

final int cappedNumHits = Math.min(numHits, limit);

final LeafSlice[] leafSlices = getSlices();
final CollectorManager<TopScoreDocCollector, TopDocs> manager =
new CollectorManager<TopScoreDocCollector, TopDocs>() {

private final HitsThresholdChecker hitsThresholdChecker =
leafSlices.length <= 1
? HitsThresholdChecker.create(Math.max(TOTAL_HITS_THRESHOLD, numHits))
: HitsThresholdChecker.createShared(Math.max(TOTAL_HITS_THRESHOLD, numHits));

private final MaxScoreAccumulator minScoreAcc =
leafSlices.length <= 1 ? null : new MaxScoreAccumulator();

@Override
public TopScoreDocCollector newCollector() throws IOException {
return TopScoreDocCollector.create(
cappedNumHits, after, hitsThresholdChecker, minScoreAcc);
}

@Override
public TopDocs reduce(Collection<TopScoreDocCollector> collectors) throws IOException {
final TopDocs[] topDocs = new TopDocs[collectors.size()];
int i = 0;
for (TopScoreDocCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(0, cappedNumHits, topDocs);
}
};
final boolean supportsConcurrency = getSlices().length > 1;
CollectorManager<TopScoreDocCollector, TopDocs> manager =
new TopScoreDocCollectorManager(
cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);

return search(query, manager);
}
Expand All @@ -510,7 +484,10 @@ public TopDocs search(Query query, int n) throws IOException {
*
* @throws TooManyClauses If a query would exceed {@link IndexSearcher#getMaxClauseCount()}
* clauses.
* @deprecated This method is being deprecated in favor of {@link IndexSearcher#search(Query,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we like the @lucene.deprecated tag?

Is your reasoning behind keeping this in there for now that all of our uses of this (internal to Lucene) haven't yet migrated as part of this change? Would the plan me to migrate all usages off of this internally and then actually remove it on main/9.0, or are you thinking of keeping it around until 10.0? I think our backwards compatibility policy is such that we could just directly remove this on main/9.0, but then leave it like you have it (marked deprecated) if you choose to backport this to 8x. Since this method is so fundamental though, I could easily see an argument to keep it around for an extra major release to give users more time to migrate. Then again, the migration path seems pretty straight-forward. What do you think?

Copy link
Contributor Author

@zacharymorn zacharymorn Aug 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we like the @lucene.deprecated tag?

Do you mean @lucene.deprecated should be created and used instead of using @deprecated? I can't seems to find that tag is being used in lucene actually...I think @lucene.deprecated most likely won't be recognized by IDE or build tool to signal / warn the use of deprecated methods though.

Is your reasoning behind keeping this in there for now that all of our uses of this (internal to Lucene) haven't yet migrated as part of this change? Would the plan me to migrate all usages off of this internally and then actually remove it on main/9.0, or are you thinking of keeping it around until 10.0? I think our backwards compatibility policy is such that we could just directly remove this on main/9.0, but then leave it like you have it (marked deprecated) if you choose to backport this to 8x. Since this method is so fundamental though, I could easily see an argument to keep it around for an extra major release to give users more time to migrate. Then again, the migration path seems pretty straight-forward. What do you think?

As explained in https://issues.apache.org/jira/browse/LUCENE-10002?focusedCommentId=17397827&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-17397827, I use deprecation instead of direct removal in this PR exactly for the reasons you mentioned:

  1. Not all internal uses have been migrated in this PR (which may require another few thousand lines of changes).
  2. I personally feel that we should give users more time for migration and testing out the changes, so 10.0 release might be a good timing for complete removal.

I would be interested in seeing how folks feel about the timing of removal, but after #1 is completed, complete removal of IndexSearcher#search(Query, Collector) in lucene codebase should require only straightforward deletion changes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should try to keep IndexSearcher#search(Query, Collector) and the TopXXXCollector#create factory methods during the 9.x series, as I expect it to be quite commonly used. However in my opinion it would be fine to cut over expert classes like those in the benchmark module without providing such long bw compatibility.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 makes sense to keep it in 9.0. As for my comment about @lucene.deprecated, I was getting this mixed up with something else. Apologies for any confusion!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As for my comment about @lucene.deprecated, I was getting this mixed up with something else. Apologies for any confusion!

Though, I think there are both @deprecated (javadoc tag) and @Deprecated (Java class annotation) -- we do both of them typically (looks like you did here @zacharymorn, great!).

* CollectorManager)} due to its support for concurrency in IndexSearcher
*/
@Deprecated
public void search(Query query, Collector results) throws IOException {
query = rewrite(query, results.scoreMode().needsScores());
search(leafContexts, createWeight(query, results.scoreMode(), 1), results);
Expand Down Expand Up @@ -602,34 +579,10 @@ private TopFieldDocs searchAfter(
final Sort rewrittenSort = sort.rewrite(this);
final LeafSlice[] leafSlices = getSlices();

final boolean supportsConcurrency = leafSlices.length > 1;
final CollectorManager<TopFieldCollector, TopFieldDocs> manager =
new CollectorManager<>() {

private final HitsThresholdChecker hitsThresholdChecker =
leafSlices.length <= 1
? HitsThresholdChecker.create(Math.max(TOTAL_HITS_THRESHOLD, numHits))
: HitsThresholdChecker.createShared(Math.max(TOTAL_HITS_THRESHOLD, numHits));

private final MaxScoreAccumulator minScoreAcc =
leafSlices.length <= 1 ? null : new MaxScoreAccumulator();

@Override
public TopFieldCollector newCollector() throws IOException {
// TODO: don't pay the price for accurate hit counts by default
return TopFieldCollector.create(
rewrittenSort, cappedNumHits, after, hitsThresholdChecker, minScoreAcc);
}

@Override
public TopFieldDocs reduce(Collection<TopFieldCollector> collectors) throws IOException {
final TopFieldDocs[] topDocs = new TopFieldDocs[collectors.size()];
int i = 0;
for (TopFieldCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(rewrittenSort, 0, cappedNumHits, topDocs);
}
};
new TopFieldCollectorManager(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wonderful to see all this redundant code go away!

rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);

TopFieldDocs topDocs = search(query, manager);
if (doDocScores) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ public TopDocs rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int top

List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();

TopFieldCollector collector = TopFieldCollector.create(sort, topN, Integer.MAX_VALUE);
TopFieldCollector collector =
new TopFieldCollectorManager(
sort, topN, null, Integer.MAX_VALUE, searcher.getSlices().length > 1)
.newCollector();

// Now merge sort docIDs from hits, with reader's leaves:
int hitUpto = 0;
Expand Down
3 changes: 1 addition & 2 deletions lucene/core/src/java/org/apache/lucene/search/TopDocs.java
Original file line number Diff line number Diff line change
Expand Up @@ -232,8 +232,7 @@ public static TopDocs merge(
/**
* Returns a new TopFieldDocs, containing topN results across the provided TopFieldDocs, sorting
* by the specified {@link Sort}. Each of the TopDocs must have been sorted by the same Sort, and
* sort field values must have been filled (ie, <code>fillFields=true</code> must be passed to
* {@link TopFieldCollector#create}).
* sort field values must have been filled.
*
* @see #merge(Sort, int, int, TopFieldDocs[])
* @lucene.experimental
Expand Down
108 changes: 17 additions & 91 deletions lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
Expand Down Expand Up @@ -174,7 +173,7 @@ private static boolean canEarlyTerminateOnPrefix(Sort searchSort, Sort indexSort
* Implements a TopFieldCollector over one SortField criteria, with tracking
zacharymorn marked this conversation as resolved.
Show resolved Hide resolved
* document scores and maxScore.
*/
private static class SimpleFieldCollector extends TopFieldCollector {
static class SimpleFieldCollector extends TopFieldCollector {
final Sort sort;
final FieldValueHitQueue<Entry> queue;

Expand Down Expand Up @@ -225,7 +224,7 @@ public void collect(int doc) throws IOException {
/*
* Implements a TopFieldCollector when after != null.
*/
private static final class PagingFieldCollector extends TopFieldCollector {
static final class PagingFieldCollector extends TopFieldCollector {

final Sort sort;
int collectedHits;
Expand Down Expand Up @@ -405,9 +404,13 @@ protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {
* count of the result will be accurate. {@link Integer#MAX_VALUE} may be used to make the hit
* count accurate, but this will also make query processing slower.
* @return a {@link TopFieldCollector} instance which will sort the results by the sort criteria.
* @deprecated This method is deprecated in favor of the constructor of {@link
* TopFieldCollectorManager} due to its support for concurrency in IndexSearcher
*/
@Deprecated
gsmiller marked this conversation as resolved.
Show resolved Hide resolved
public static TopFieldCollector create(Sort sort, int numHits, int totalHitsThreshold) {
return create(sort, numHits, null, totalHitsThreshold);
return new TopFieldCollectorManager(sort, numHits, null, totalHitsThreshold, false)
.newCollector();
}

/**
Expand All @@ -429,106 +432,29 @@ public static TopFieldCollector create(Sort sort, int numHits, int totalHitsThre
* field is indexed both with doc values and points. In this case, there is an assumption that
* the same data is stored in these points and doc values.
* @return a {@link TopFieldCollector} instance which will sort the results by the sort criteria.
* @deprecated This method is deprecated in favor of the constructor of {@link
* TopFieldCollectorManager} due to its support for concurrency in IndexSearcher
*/
@Deprecated
gsmiller marked this conversation as resolved.
Show resolved Hide resolved
public static TopFieldCollector create(
Sort sort, int numHits, FieldDoc after, int totalHitsThreshold) {
if (totalHitsThreshold < 0) {
throw new IllegalArgumentException(
"totalHitsThreshold must be >= 0, got " + totalHitsThreshold);
}

return create(
sort,
numHits,
after,
HitsThresholdChecker.create(Math.max(totalHitsThreshold, numHits)),
null /* bottomValueChecker */);
}

/**
* Same as above with additional parameters to allow passing in the threshold checker and the max
* score accumulator.
*/
static TopFieldCollector create(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you're fully killing two methods here, does that mean that you're not intending to maintain back compat on 9.0? I assume these would get added back in and marked deprecated if backporting to 8x? I think it's reasonable, just checking what your intention is.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry just realized I missed a few comments earlier (they were collapsed on github UI). I think this one and the one from TopScoreDocCollector.java should be safe to remove directly even when backporting to 8x, as they are package private and hence ideally no users should be using them (and assuming we don't support hacky access to these methods such as reflection or byte code manipulation) ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed we don't support access to pkg-private code.

Sort sort,
int numHits,
FieldDoc after,
HitsThresholdChecker hitsThresholdChecker,
MaxScoreAccumulator minScoreAcc) {

if (sort.getSort().length == 0) {
throw new IllegalArgumentException("Sort must contain at least one field");
}

if (numHits <= 0) {
throw new IllegalArgumentException(
"numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count");
}

if (hitsThresholdChecker == null) {
throw new IllegalArgumentException("hitsThresholdChecker should not be null");
}

FieldValueHitQueue<Entry> queue = FieldValueHitQueue.create(sort.getSort(), numHits);

if (after == null) {
// inform a comparator that sort is based on this single field
// to enable some optimizations for skipping over non-competitive documents
// We can't set single sort when the `after` parameter is non-null as it's
// an implicit sort over the document id.
if (queue.comparators.length == 1) {
queue.comparators[0].setSingleSort();
}
return new SimpleFieldCollector(sort, queue, numHits, hitsThresholdChecker, minScoreAcc);
} else {
if (after.fields == null) {
throw new IllegalArgumentException(
"after.fields wasn't set; you must pass fillFields=true for the previous search");
}

if (after.fields.length != sort.getSort().length) {
throw new IllegalArgumentException(
"after.fields has "
+ after.fields.length
+ " values but sort has "
+ sort.getSort().length);
}

return new PagingFieldCollector(
sort, queue, after, numHits, hitsThresholdChecker, minScoreAcc);
}
return new TopFieldCollectorManager(sort, numHits, after, totalHitsThreshold, false)
.newCollector();
}

/**
* Create a CollectorManager which uses a shared hit counter to maintain number of hits and a
* shared {@link MaxScoreAccumulator} to propagate the minimum score accross segments if the
* primary sort is by relevancy.
*
* @deprecated This method is deprecated in favor of the constructor of {@link
* TopFieldCollectorManager} due to its support for concurrency in IndexSearcher
*/
@Deprecated
public static CollectorManager<TopFieldCollector, TopFieldDocs> createSharedManager(
Sort sort, int numHits, FieldDoc after, int totalHitsThreshold) {

int totalHitsMax = Math.max(totalHitsThreshold, numHits);
return new CollectorManager<>() {
private final HitsThresholdChecker hitsThresholdChecker =
HitsThresholdChecker.createShared(totalHitsMax);
private final MaxScoreAccumulator minScoreAcc =
totalHitsMax == Integer.MAX_VALUE ? null : new MaxScoreAccumulator();

@Override
public TopFieldCollector newCollector() throws IOException {
return create(sort, numHits, after, hitsThresholdChecker, minScoreAcc);
}

@Override
public TopFieldDocs reduce(Collection<TopFieldCollector> collectors) throws IOException {
final TopFieldDocs[] topDocs = new TopFieldDocs[collectors.size()];
int i = 0;
for (TopFieldCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(sort, 0, numHits, topDocs);
}
};
return new TopFieldCollectorManager(sort, numHits, after, totalHitsThreshold, true);
}

/**
Expand Down
Loading