-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add two new "Seeded" Knn queries for seeded vector search (#14084)
### Description In some vector search cases, users may already know some documents that are likely related to a query. Let's support seeding HNSW's scoring stage with these documents, rather than using HNSW's hierarchical stage. An example use case is hybrid search, where both a traditional and a vector search are performed. The top results from the traditional search are likely reasonable seeds for the vector search. Even when not performing hybrid search, traditional matching can often be faster than traversing the hierarchy, which can be used to speed up the vector search process (up to 2x faster for the same effectiveness), as was demonstrated in [this article](https://arxiv.org/abs/2307.16779) (full disclosure: seanmacavaney is an author of the article). The main changes are: - A new seed-focused knn collector and collector manager - Two new basic knn queries that use these specialized collectors to provide seeded entry points - `HnswGraphSearcher` now bypasses the `findBestEntryPoint` step if seeds are provided. //cc @seanmacavaney Co-authored-by: Sean MacAvaney <smacavaney@bloomberg.com> Co-authored-by: Sean MacAvaney <sean.macavaney@gmail.com> Co-authored-by: Christine Poerschke <cpoerschke@apache.org>
- Loading branch information
1 parent
905efa9
commit 34f0453
Showing
17 changed files
with
1,075 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
97 changes: 97 additions & 0 deletions
97
lucene/core/src/java/org/apache/lucene/search/SeededKnnByteVectorQuery.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.search; | ||
|
||
import java.io.IOException; | ||
import java.util.Objects; | ||
import org.apache.lucene.index.ByteVectorValues; | ||
import org.apache.lucene.search.knn.KnnCollectorManager; | ||
import org.apache.lucene.search.knn.SeededKnnCollectorManager; | ||
|
||
/** | ||
* This is a version of knn byte vector query that provides a query seed to initiate the vector | ||
* search. NOTE: The underlying format is free to ignore the provided seed | ||
* | ||
* <p>See <a href="https://dl.acm.org/doi/10.1145/3539618.3591715">"Lexically-Accelerated Dense | ||
* Retrieval"</a> (Kulkarni, Hrishikesh and MacAvaney, Sean and Goharian, Nazli and Frieder, Ophir). | ||
* In SIGIR '23: Proceedings of the 46th International ACM SIGIR Conference on Research and | ||
* Development in Information Retrieval Pages 152 - 162 | ||
* | ||
* @lucene.experimental | ||
*/ | ||
public class SeededKnnByteVectorQuery extends KnnByteVectorQuery { | ||
final Query seed; | ||
final Weight seedWeight; | ||
|
||
/** | ||
* Construct a new SeededKnnByteVectorQuery instance | ||
* | ||
* @param field knn byte vector field to query | ||
* @param target the query vector | ||
* @param k number of neighbors to return | ||
* @param filter a filter on the neighbors to return | ||
* @param seed a query seed to initiate the vector format search | ||
*/ | ||
public SeededKnnByteVectorQuery(String field, byte[] target, int k, Query filter, Query seed) { | ||
super(field, target, k, filter); | ||
this.seed = Objects.requireNonNull(seed); | ||
this.seedWeight = null; | ||
} | ||
|
||
SeededKnnByteVectorQuery(String field, byte[] target, int k, Query filter, Weight seedWeight) { | ||
super(field, target, k, filter); | ||
this.seed = null; | ||
this.seedWeight = Objects.requireNonNull(seedWeight); | ||
} | ||
|
||
@Override | ||
public Query rewrite(IndexSearcher indexSearcher) throws IOException { | ||
if (seedWeight != null) { | ||
return super.rewrite(indexSearcher); | ||
} | ||
BooleanQuery.Builder booleanSeedQueryBuilder = | ||
new BooleanQuery.Builder() | ||
.add(seed, BooleanClause.Occur.MUST) | ||
.add(new FieldExistsQuery(field), BooleanClause.Occur.FILTER); | ||
if (filter != null) { | ||
booleanSeedQueryBuilder.add(filter, BooleanClause.Occur.FILTER); | ||
} | ||
Query seedRewritten = indexSearcher.rewrite(booleanSeedQueryBuilder.build()); | ||
Weight seedWeight = indexSearcher.createWeight(seedRewritten, ScoreMode.TOP_SCORES, 1f); | ||
SeededKnnByteVectorQuery rewritten = | ||
new SeededKnnByteVectorQuery(field, target, k, filter, seedWeight); | ||
return rewritten.rewrite(indexSearcher); | ||
} | ||
|
||
@Override | ||
protected KnnCollectorManager getKnnCollectorManager(int k, IndexSearcher searcher) { | ||
if (seedWeight == null) { | ||
throw new UnsupportedOperationException("must be rewritten before constructing manager"); | ||
} | ||
return new SeededKnnCollectorManager( | ||
super.getKnnCollectorManager(k, searcher), | ||
seedWeight, | ||
k, | ||
leaf -> { | ||
ByteVectorValues vv = leaf.getByteVectorValues(field); | ||
if (vv == null) { | ||
ByteVectorValues.checkField(leaf.getContext().reader(), field); | ||
} | ||
return vv; | ||
}); | ||
} | ||
} |
97 changes: 97 additions & 0 deletions
97
lucene/core/src/java/org/apache/lucene/search/SeededKnnFloatVectorQuery.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.search; | ||
|
||
import java.io.IOException; | ||
import java.util.Objects; | ||
import org.apache.lucene.index.FloatVectorValues; | ||
import org.apache.lucene.search.knn.KnnCollectorManager; | ||
import org.apache.lucene.search.knn.SeededKnnCollectorManager; | ||
|
||
/** | ||
* This is a version of knn float vector query that provides a query seed to initiate the vector | ||
* search. NOTE: The underlying format is free to ignore the provided seed. | ||
* | ||
* <p>See <a href="https://dl.acm.org/doi/10.1145/3539618.3591715">"Lexically-Accelerated Dense | ||
* Retrieval"</a> (Kulkarni, Hrishikesh and MacAvaney, Sean and Goharian, Nazli and Frieder, Ophir). | ||
* In SIGIR '23: Proceedings of the 46th International ACM SIGIR Conference on Research and | ||
* Development in Information Retrieval Pages 152 - 162 | ||
* | ||
* @lucene.experimental | ||
*/ | ||
public class SeededKnnFloatVectorQuery extends KnnFloatVectorQuery { | ||
final Query seed; | ||
final Weight seedWeight; | ||
|
||
/** | ||
* Construct a new SeededKnnFloatVectorQuery instance | ||
* | ||
* @param field knn float vector field to query | ||
* @param target the query vector | ||
* @param k number of neighbors to return | ||
* @param filter a filter on the neighbors to return | ||
* @param seed a query seed to initiate the vector format search | ||
*/ | ||
public SeededKnnFloatVectorQuery(String field, float[] target, int k, Query filter, Query seed) { | ||
super(field, target, k, filter); | ||
this.seed = Objects.requireNonNull(seed); | ||
this.seedWeight = null; | ||
} | ||
|
||
SeededKnnFloatVectorQuery(String field, float[] target, int k, Query filter, Weight seedWeight) { | ||
super(field, target, k, filter); | ||
this.seed = null; | ||
this.seedWeight = Objects.requireNonNull(seedWeight); | ||
} | ||
|
||
@Override | ||
public Query rewrite(IndexSearcher indexSearcher) throws IOException { | ||
if (seedWeight != null) { | ||
return super.rewrite(indexSearcher); | ||
} | ||
BooleanQuery.Builder booleanSeedQueryBuilder = | ||
new BooleanQuery.Builder() | ||
.add(seed, BooleanClause.Occur.MUST) | ||
.add(new FieldExistsQuery(field), BooleanClause.Occur.FILTER); | ||
if (filter != null) { | ||
booleanSeedQueryBuilder.add(filter, BooleanClause.Occur.FILTER); | ||
} | ||
Query seedRewritten = indexSearcher.rewrite(booleanSeedQueryBuilder.build()); | ||
Weight seedWeight = indexSearcher.createWeight(seedRewritten, ScoreMode.TOP_SCORES, 1f); | ||
SeededKnnFloatVectorQuery rewritten = | ||
new SeededKnnFloatVectorQuery(field, target, k, filter, seedWeight); | ||
return rewritten.rewrite(indexSearcher); | ||
} | ||
|
||
@Override | ||
protected KnnCollectorManager getKnnCollectorManager(int k, IndexSearcher searcher) { | ||
if (seedWeight == null) { | ||
throw new UnsupportedOperationException("must be rewritten before constructing manager"); | ||
} | ||
return new SeededKnnCollectorManager( | ||
super.getKnnCollectorManager(k, searcher), | ||
seedWeight, | ||
k, | ||
leaf -> { | ||
FloatVectorValues vv = leaf.getFloatVectorValues(field); | ||
if (vv == null) { | ||
FloatVectorValues.checkField(leaf.getContext().reader(), field); | ||
} | ||
return vv; | ||
}); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
28 changes: 28 additions & 0 deletions
28
lucene/core/src/java/org/apache/lucene/search/knn/EntryPointProvider.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.search.knn; | ||
|
||
import org.apache.lucene.search.DocIdSetIterator; | ||
|
||
/** Provides entry points for the kNN search */ | ||
public interface EntryPointProvider { | ||
/** Iterator of valid entry points for the kNN search */ | ||
DocIdSetIterator entryPoints(); | ||
|
||
/** Number of valid entry points for the kNN search */ | ||
int numberOfEntryPoints(); | ||
} |
Oops, something went wrong.