Skip to content

Commit

Permalink
Add support for more than one inner_hit when searching nested vectors (elastic#104006)
Browse files Browse the repository at this point in the history

This commit adds the ability to gather more than one inner_hit when
searching nested kNN.

# Global kNN example

```
POST test/_search
{
    "_source": false,
    "fields": [
        "name"
    ],
    "knn": {
        "field": "nested.vector",
        "query_vector": [
            -0.5,
            90,
            -10,
            14.8,
            -156
        ],
        "k": 3,
        "num_candidates": 3,
        "inner_hits": {
            "size": 2,
            "fields": [
                "nested.paragraph_id"
            ],
            "_source": false
        }
    }
}
```

Results in

<details>

```
{
    "took": 66,
    "timed_out": false,
    "_shards": {
        "total": 2,
        "successful": 2,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 2,
            "relation": "eq"
        },
        "max_score": 0.009090909,
        "hits": [
            {
                "_index": "test",
                "_id": "2",
                "_score": 0.009090909,
                "fields": {
                    "name": [
                        "moose.jpg"
                    ]
                },
                "inner_hits": {
                    "nested": {
                        "hits": {
                            "total": {
                                "value": 2,
                                "relation": "eq"
                            },
                            "max_score": 0.009090909,
                            "hits": [
                                {
                                    "_index": "test",
                                    "_id": "2",
                                    "_nested": {
                                        "field": "nested",
                                        "offset": 0
                                    },
                                    "_score": 0.009090909,
                                    "fields": {
                                        "nested": [
                                            {
                                                "paragraph_id": [
                                                    "0"
                                                ]
                                            }
                                        ]
                                    }
                                },
                                {
                                    "_index": "test",
                                    "_id": "2",
                                    "_nested": {
                                        "field": "nested",
                                        "offset": 1
                                    },
                                    "_score": 0.004968944,
                                    "fields": {
                                        "nested": [
                                            {
                                                "paragraph_id": [
                                                    "2"
                                                ]
                                            }
                                        ]
                                    }
                                }
                            ]
                        }
                    }
                }
            },
            {
                "_index": "test",
                "_id": "3",
                "_score": 0.0021519717,
                "fields": {
                    "name": [
                        "rabbit.jpg"
                    ]
                },
                "inner_hits": {
                    "nested": {
                        "hits": {
                            "total": {
                                "value": 1,
                                "relation": "eq"
                            },
                            "max_score": 0.0021519717,
                            "hits": [
                                {
                                    "_index": "test",
                                    "_id": "3",
                                    "_nested": {
                                        "field": "nested",
                                        "offset": 0
                                    },
                                    "_score": 0.0021519717,
                                    "fields": {
                                        "nested": [
                                            {
                                                "paragraph_id": [
                                                    "0"
                                                ]
                                            }
                                        ]
                                    }
                                }
                            ]
                        }
                    }
                }
            }
        ]
    }
}
```

</details>

# kNN Query example

Using a kNN query inside a `nested` query opens an interesting door: it
allows for multiple inner_hit scoring schemes.

## Nearest by max passage only

```
POST test/_search
{
    "size": 3,
    "query": {
        "nested": {
            "path": "nested",
            "score_mode": "max",
            "query": {
                "knn": {
                    "field": "nested.vector",
                    "query_vector": [
                        -0.5,
                        90,
                        -10,
                        14.8,
                        -156
                    ],
                    "num_candidates": 5
                }
            },
            "inner_hits": {
                "size": 2,
                "_source": false,
                "fields": [
                    "nested.paragraph_id"
                ]
            }
        }
    }
}
```

closes: elastic#102950
  • Loading branch information
benwtrent authored Jan 17, 2024
1 parent b4d1e95 commit e4feaff
Show file tree
Hide file tree
Showing 24 changed files with 908 additions and 26 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/104006.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 104006
summary: Add support for more than one `inner_hit` when searching nested vectors
area: Vector Search
type: enhancement
issues: []
10 changes: 4 additions & 6 deletions docs/reference/search/search-your-data/knn-search.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -821,9 +821,6 @@ Now we have filtered based on the top level `"creation_time"` and only one docum
Additionally, if you wanted to extract the nearest passage for a matched document, you can supply <<inner-hits, inner_hits>>
to the `knn` clause.

NOTE: `inner_hits` for kNN will only ever return a single hit, the nearest passage vector.
Setting `"size"` to any value greater than `1` will have no effect on the results.

NOTE: When using `inner_hits` and multiple `knn` clauses, be sure to specify the <<inner-hits-options,`inner_hits.name`>>
field. Otherwise, a naming clash can occur and fail the search request.

Expand All @@ -848,7 +845,8 @@ POST passage_vectors/_search
"_source": false,
"fields": [
"paragraph.text"
]
],
"size": 1
}
}
}
Expand Down Expand Up @@ -891,7 +889,7 @@ Now the result will contain the nearest found paragraph when searching.
"paragraph": {
"hits": {
"total": {
"value": 1,
"value": 2,
"relation": "eq"
},
"max_score": 1.0,
Expand Down Expand Up @@ -935,7 +933,7 @@ Now the result will contain the nearest found paragraph when searching.
"paragraph": {
"hits": {
"total": {
"value": 1,
"value": 2,
"relation": "eq"
},
"max_score": 0.9997144,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ setup:
indices.create:
index: test
body:
settings:
index:
number_of_shards: 2
mappings:
properties:
name:
Expand Down Expand Up @@ -135,6 +138,172 @@ setup:
- match: {hits.hits.0.fields.name.0: "rabbit.jpg"}
- match: {hits.hits.0.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0"}
---
"nested kNN search inner_hits size > 1":
- skip:
version: ' - 8.12.99'
reason: 'inner_hits on nested kNN search added in 8.13'

- do:
index:
index: test
id: "4"
body:
name: moose.jpg
nested:
- paragraph_id: 0
vector: [ -0.5, 100.0, -13, 14.8, -156.0 ]
- paragraph_id: 2
vector: [ 0, 100.0, 0, 14.8, -156.0 ]
- paragraph_id: 3
vector: [ 0, 1.0, 0, 1.8, -15.0 ]

- do:
index:
index: test
id: "5"
body:
name: moose.jpg
nested:
- paragraph_id: 0
vector: [ -0.5, 100.0, -13, 14.8, -156.0 ]
- paragraph_id: 2
vector: [ 0, 100.0, 0, 14.8, -156.0 ]
- paragraph_id: 3
vector: [ 0, 1.0, 0, 1.8, -15.0 ]

- do:
index:
index: test
id: "6"
body:
name: moose.jpg
nested:
- paragraph_id: 0
vector: [ -0.5, 100.0, -13, 14.8, -156.0 ]
- paragraph_id: 2
vector: [ 0, 100.0, 0, 14.8, -156.0 ]
- paragraph_id: 3
vector: [ 0, 1.0, 0, 1.8, -15.0 ]
- do:
indices.refresh: { }

- do:
search:
index: test
body:
fields: [ "name" ]
knn:
field: nested.vector
query_vector: [-0.5, 90.0, -10, 14.8, -156.0]
k: 3
num_candidates: 5
inner_hits: {size: 2, fields: ["nested.paragraph_id"], _source: false}

- match: {hits.total.value: 3}
- length: { hits.hits.0.inner_hits.nested.hits.hits: 2 }
- length: { hits.hits.1.inner_hits.nested.hits.hits: 2 }
- length: { hits.hits.2.inner_hits.nested.hits.hits: 2 }

- match: { hits.hits.0.fields.name.0: "moose.jpg" }
- match: { hits.hits.0.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }

- do:
search:
index: test
body:
fields: [ "name" ]
knn:
field: nested.vector
query_vector: [-0.5, 90.0, -10, 14.8, -156.0]
k: 5
num_candidates: 5
inner_hits: {size: 2, fields: ["nested.paragraph_id"], _source: false}

- match: {hits.total.value: 5}
# The first four matches are all "moose.jpg" (the docs indexed above each have 3 nested vectors), but only the two closest are returned because inner_hits size is 2
- match: {hits.hits.0.fields.name.0: "moose.jpg"}
- length: { hits.hits.0.inner_hits.nested.hits.hits: 2 }
- match: { hits.hits.0.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
- match: { hits.hits.0.inner_hits.nested.hits.hits.1.fields.nested.0.paragraph_id.0: "2" }
- match: {hits.hits.1.fields.name.0: "moose.jpg"}
- length: { hits.hits.1.inner_hits.nested.hits.hits: 2 }
- match: { hits.hits.1.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
- match: { hits.hits.1.inner_hits.nested.hits.hits.1.fields.nested.0.paragraph_id.0: "2" }
- match: {hits.hits.2.fields.name.0: "moose.jpg"}
- length: { hits.hits.2.inner_hits.nested.hits.hits: 2 }
- match: { hits.hits.2.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
- match: { hits.hits.2.inner_hits.nested.hits.hits.1.fields.nested.0.paragraph_id.0: "2" }
- match: {hits.hits.3.fields.name.0: "moose.jpg"}
- length: { hits.hits.3.inner_hits.nested.hits.hits: 2 }
- match: { hits.hits.3.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
- match: { hits.hits.3.inner_hits.nested.hits.hits.1.fields.nested.0.paragraph_id.0: "2" }
# Rabbit only has one passage vector
- match: {hits.hits.4.fields.name.0: "rabbit.jpg"}
- length: { hits.hits.4.inner_hits.nested.hits.hits: 1 }

- do:
search:
index: test
body:
fields: [ "name" ]
knn:
field: nested.vector
query_vector: [ -0.5, 90.0, -10, 14.8, -156.0 ]
k: 3
num_candidates: 3
filter: {term: {name: "cow.jpg"}}
inner_hits: {size: 3, fields: ["nested.paragraph_id"], _source: false}

- match: {hits.total.value: 1}
- match: { hits.hits.0._id: "1" }
- length: { hits.hits.0.inner_hits.nested.hits.hits: 2 }
- match: { hits.hits.0.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
- match: { hits.hits.0.inner_hits.nested.hits.hits.1.fields.nested.0.paragraph_id.0: "1" }
---
"nested kNN search inner_hits & boosting":
- skip:
version: ' - 8.12.99'
reason: 'inner_hits on nested kNN search added in 8.13'
features: close_to

- do:
search:
index: test
body:
fields: [ "name" ]
knn:
field: nested.vector
query_vector: [-0.5, 90.0, -10, 14.8, -156.0]
k: 3
num_candidates: 5
inner_hits: {size: 2, fields: ["nested.paragraph_id"], _source: false}

- close_to: { hits.hits.0._score: {value: 0.00909090, error: 0.00001} }
- close_to: { hits.hits.0.inner_hits.nested.hits.hits.0._score: {value: 0.00909090, error: 0.00001} }
- close_to: { hits.hits.1._score: {value: 0.0021519717, error: 0.00001} }
- close_to: { hits.hits.1.inner_hits.nested.hits.hits.0._score: {value: 0.0021519717, error: 0.00001} }
- close_to: { hits.hits.2._score: {value: 0.00001, error: 0.00001} }
- close_to: { hits.hits.2.inner_hits.nested.hits.hits.0._score: {value: 0.00001, error: 0.00001} }

- do:
search:
index: test
body:
fields: [ "name" ]
knn:
field: nested.vector
query_vector: [-0.5, 90.0, -10, 14.8, -156.0]
k: 3
num_candidates: 5
boost: 2
inner_hits: {size: 2, fields: ["nested.paragraph_id"], _source: false}
- close_to: { hits.hits.0._score: {value: 0.0181818, error: 0.00001} }
- close_to: { hits.hits.0.inner_hits.nested.hits.hits.0._score: {value: 0.0181818, error: 0.00001} }
- close_to: { hits.hits.1._score: {value: 0.0043039434, error: 0.00001} }
- close_to: { hits.hits.1.inner_hits.nested.hits.hits.0._score: {value: 0.0043039434, error: 0.00001} }
- close_to: { hits.hits.2._score: {value: 0.00002, error: 0.00001} }
- close_to: { hits.hits.2.inner_hits.nested.hits.hits.0._score: {value: 0.00002, error: 0.00001} }
---
"nested kNN search inner_hits & profiling":
- skip:
version: ' - 8.12.99'
Expand All @@ -144,7 +313,6 @@ setup:
index: test
body:
profile: true
_source: false
fields: [ "name" ]
knn:
field: nested.vector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ setup:
- match: {hits.hits.0.fields.name.0: "rabbit.jpg"}
- match: { hits.hits.0.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
---

"nested kNN search post-filtered on nested fields DOES NOT work":
- do:
search:
Expand All @@ -211,3 +210,112 @@ setup:
# TODO: fix it on Lucene level so nested knn respects num_candidates
# or do pre-filtering
- match: {hits.total.value: 0}
---
"nested kNN search inner_hits size > 1":
- skip:
version: ' - 8.12.99'
reason: 'inner_hits on nested kNN search added in 8.13'

- do:
index:
index: test
id: "4"
body:
name: moose.jpg
nested:
- paragraph_id: 0
vector: [ -0.5, 100.0, -13, 14.8, -156.0 ]
- paragraph_id: 2
vector: [ 0, 100.0, 0, 14.8, -156.0 ]
- paragraph_id: 3
vector: [ 0, 1.0, 0, 1.8, -15.0 ]

- do:
index:
index: test
id: "5"
body:
name: moose.jpg
nested:
- paragraph_id: 0
vector: [ -0.5, 100.0, -13, 14.8, -156.0 ]
- paragraph_id: 2
vector: [ 0, 100.0, 0, 14.8, -156.0 ]
- paragraph_id: 3
vector: [ 0, 1.0, 0, 1.8, -15.0 ]

- do:
index:
index: test
id: "6"
body:
name: moose.jpg
nested:
- paragraph_id: 0
vector: [ -0.5, 100.0, -13, 14.8, -156.0 ]
- paragraph_id: 2
vector: [ 0, 100.0, 0, 14.8, -156.0 ]
- paragraph_id: 3
vector: [ 0, 1.0, 0, 1.8, -15.0 ]
- do:
indices.refresh: { }

- do:
search:
index: test
size: 3
body:
fields: [ "name" ]
query:
nested:
path: nested
query:
knn:
field: nested.vector
query_vector: [ -0.5, 90.0, -10, 14.8, -156.0 ]
num_candidates: 5
inner_hits: { size: 2, "fields": [ "nested.paragraph_id" ], _source: false }

- match: {hits.total.value: 5}
- length: { hits.hits.0.inner_hits.nested.hits.hits: 2 }
- length: { hits.hits.1.inner_hits.nested.hits.hits: 2 }
- length: { hits.hits.2.inner_hits.nested.hits.hits: 2 }


- do:
search:
index: test
size: 5
body:
fields: [ "name" ]
query:
nested:
path: nested
query:
knn:
field: nested.vector
query_vector: [ -0.5, 90.0, -10, 14.8, -156.0 ]
num_candidates: 5
inner_hits: { size: 2, "fields": [ "nested.paragraph_id" ], _source: false }

- match: {hits.total.value: 5}
# The first four matches are all "moose.jpg" (the docs indexed above each have 3 nested vectors), but only the two closest are returned because inner_hits size is 2
- match: {hits.hits.0.fields.name.0: "moose.jpg"}
- length: { hits.hits.0.inner_hits.nested.hits.hits: 2 }
- match: { hits.hits.0.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
- match: { hits.hits.0.inner_hits.nested.hits.hits.1.fields.nested.0.paragraph_id.0: "2" }
- match: {hits.hits.1.fields.name.0: "moose.jpg"}
- length: { hits.hits.1.inner_hits.nested.hits.hits: 2 }
- match: { hits.hits.1.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
- match: { hits.hits.1.inner_hits.nested.hits.hits.1.fields.nested.0.paragraph_id.0: "2" }
- match: {hits.hits.2.fields.name.0: "moose.jpg"}
- length: { hits.hits.2.inner_hits.nested.hits.hits: 2 }
- match: { hits.hits.2.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
- match: { hits.hits.2.inner_hits.nested.hits.hits.1.fields.nested.0.paragraph_id.0: "2" }
- match: {hits.hits.3.fields.name.0: "moose.jpg"}
- length: { hits.hits.3.inner_hits.nested.hits.hits: 2 }
- match: { hits.hits.3.inner_hits.nested.hits.hits.0.fields.nested.0.paragraph_id.0: "0" }
- match: { hits.hits.3.inner_hits.nested.hits.hits.1.fields.nested.0.paragraph_id.0: "2" }
# Rabbit only has one passage vector
- match: {hits.hits.4.fields.name.0: "rabbit.jpg"}
- length: { hits.hits.4.inner_hits.nested.hits.hits: 1 }
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.cluster.health.ClusterHealthStatus;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.query.InnerHitBuilder;
import org.elasticsearch.search.vectors.KnnSearchBuilder;
import org.elasticsearch.test.ESIntegTestCase;

Expand Down Expand Up @@ -66,8 +67,9 @@ public void testSimpleNested() throws Exception {
refresh();

assertResponse(
prepareSearch("test").setKnnSearch(List.of(new KnnSearchBuilder("nested.vector", new float[] { 1, 1, 1 }, 1, 1, null)))
.setAllowPartialSearchResults(false),
prepareSearch("test").setKnnSearch(
List.of(new KnnSearchBuilder("nested.vector", new float[] { 1, 1, 1 }, 1, 1, null).innerHit(new InnerHitBuilder()))
).setAllowPartialSearchResults(false),
response -> assertThat(response.getHits().getHits().length, greaterThan(0))
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ static TransportVersion def(int id) {
public static final TransportVersion DATE_HISTOGRAM_SUPPORT_DOWNSAMPLED_TZ = def(8_574_00_0);
public static final TransportVersion PEERFINDER_REPORTS_PEERS_MASTERS = def(8_575_00_0);
public static final TransportVersion ESQL_MULTI_CLUSTERS_ENRICH = def(8_576_00_0);
public static final TransportVersion NESTED_KNN_MORE_INNER_HITS = def(8_577_00_0);

/*
* STOP! READ THIS FIRST! No, really,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,11 @@ ShardSearchRequest rewriteShardSearchRequest(ShardSearchRequest request) {
}
scoreDocs.sort(Comparator.comparingInt(scoreDoc -> scoreDoc.doc));
String nestedPath = dfsKnnResults.getNestedPath();
QueryBuilder query = new KnnScoreDocQueryBuilder(scoreDocs.toArray(new ScoreDoc[0]));
QueryBuilder query = new KnnScoreDocQueryBuilder(
scoreDocs.toArray(new ScoreDoc[0]),
source.knnSearch().get(i).getField(),
source.knnSearch().get(i).getQueryVector()
).boost(source.knnSearch().get(i).boost());
if (nestedPath != null) {
query = new NestedQueryBuilder(nestedPath, query, ScoreMode.Max).innerHit(source.knnSearch().get(i).innerHit());
}
Expand Down
Loading

0 comments on commit e4feaff

Please sign in to comment.