Skip to content

Commit

Permalink
Ability to compute vector similarity scores with DoubleValuesSource (#…
Browse files Browse the repository at this point in the history
…12548)

### Description

This PR addresses the issue #12394. It adds an API **`similarityToQueryVector`** to `DoubleValuesSource` to compute vector similarity scores between the query vector and the `KnnByteVectorField`/`KnnFloatVectorField` for documents using the 2 new DVS implementations (`ByteVectorSimilarityValuesSource` for byte vectors and `FloatVectorSimilarityValuesSource` for float vectors). Below are the method signatures added to DVS in this PR:

- `DoubleValues similarityToQueryVector(LeafReaderContext ctx, float[] queryVector, String vectorField)` *(uses FloatVectorSimilarityValuesSource)*
- `DoubleValues similarityToQueryVector(LeafReaderContext ctx, byte[] queryVector, String vectorField)` *(uses ByteVectorSimilarityValuesSource)*

Closes #12394
  • Loading branch information
shubhamvishu authored and benwtrent committed Oct 12, 2023
1 parent c2331b4 commit 4ac2f43
Show file tree
Hide file tree
Showing 6 changed files with 644 additions and 1 deletion.
3 changes: 2 additions & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ API Changes

New Features
---------------------
(No changes)
* GITHUB#12548: Added similarityToQueryVector API to compute vector similarity scores
with DoubleValuesSource. (Shubham Chaudhary)

Improvements
---------------------
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search;

import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.VectorSimilarityFunction;

/**
 * A {@link DoubleValuesSource} which computes the vector similarity scores between the query vector
 * and the {@link org.apache.lucene.document.KnnByteVectorField} for documents.
 *
 * <p>The similarity function is taken, per segment, from the field's own configuration, so the
 * score is computed with whatever function the field was indexed with. Segments that hold no byte
 * vectors for the field produce an empty {@link DoubleValues} (no document has a value).
 */
class ByteVectorSimilarityValuesSource extends VectorSimilarityValuesSource {
  /** The query vector; stored by reference, not copied. */
  private final byte[] queryVector;

  /**
   * Creates a source scoring documents against {@code vector}.
   *
   * @param vector the byte query vector to compare each document's vector with
   * @param fieldName name of the knn byte vector field
   */
  public ByteVectorSimilarityValuesSource(byte[] vector, String fieldName) {
    super(fieldName);
    this.queryVector = vector;
  }

  /**
   * Returns per-document similarity scores for the given segment.
   *
   * @param ctx the segment to read vectors from
   * @param scores unused by this implementation
   * @return values that exist only for documents having a vector in {@code fieldName}; an empty
   *     instance if the segment has no byte vectors for the field
   * @throws IOException if reading the vectors fails
   */
  @Override
  public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
    final ByteVectorValues vectorValues = ctx.reader().getByteVectorValues(fieldName);
    if (vectorValues == null) {
      // The reader has no byte vectors for this field in this segment (e.g. no document in the
      // segment indexed the field). Previously this fell through to a NullPointerException.
      return DoubleValues.EMPTY;
    }
    // Looked up only after confirming that vectors exist: fieldInfo(...) is not dereferenced for
    // a segment that does not know the field.
    final VectorSimilarityFunction function =
        ctx.reader().getFieldInfos().fieldInfo(fieldName).getVectorSimilarityFunction();
    return new DoubleValues() {
      @Override
      public double doubleValue() throws IOException {
        return function.compare(queryVector, vectorValues.vectorValue());
      }

      @Override
      public boolean advanceExact(int doc) throws IOException {
        // The underlying iterator is only moved forward: a target behind the current position
        // reports "no value"; otherwise we either already sit on the target or advance to it.
        return doc >= vectorValues.docID()
            && (vectorValues.docID() == doc || vectorValues.advance(doc) == doc);
      }
    };
  }

  @Override
  public int hashCode() {
    return Objects.hash(fieldName, Arrays.hashCode(queryVector));
  }

  /** Two sources are equal when they target the same field with element-wise equal vectors. */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) return true;
    if (obj == null || getClass() != obj.getClass()) return false;
    ByteVectorSimilarityValuesSource other = (ByteVectorSimilarityValuesSource) obj;
    return Objects.equals(fieldName, other.fieldName)
        && Arrays.equals(queryVector, other.queryVector);
  }

  @Override
  public String toString() {
    return "ByteVectorSimilarityValuesSource(fieldName="
        + fieldName
        + " queryVector="
        + Arrays.toString(queryVector)
        + ")";
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.search.comparators.DoubleComparator;

/**
Expand Down Expand Up @@ -172,6 +173,52 @@ public LongValuesSource rewrite(IndexSearcher searcher) throws IOException {
}
}

/**
 * Returns a DoubleValues instance for computing the vector similarity score per document against
 * the byte query vector
 *
 * <p>If the segment behind {@code ctx} does not contain {@code vectorField} at all, an empty
 * instance is returned (no document has a value) instead of failing.
 *
 * @param ctx the context for which to return the DoubleValues
 * @param queryVector byte query vector
 * @param vectorField knn byte field name
 * @return DoubleValues instance
 * @throws IllegalArgumentException if the field exists but is not encoded as {@link
 *     VectorEncoding#BYTE}
 * @throws IOException if an {@link IOException} occurs
 */
public static DoubleValues similarityToQueryVector(
    LeafReaderContext ctx, byte[] queryVector, String vectorField) throws IOException {
  // fieldInfo(...) yields null for a field this segment does not know; guard before
  // dereferencing it so such a segment contributes no values rather than throwing an NPE.
  if (ctx.reader().getFieldInfos().fieldInfo(vectorField) == null) {
    return DoubleValues.EMPTY;
  }
  if (ctx.reader().getFieldInfos().fieldInfo(vectorField).getVectorEncoding()
      != VectorEncoding.BYTE) {
    throw new IllegalArgumentException(
        "Field "
            + vectorField
            + " does not have the expected vector encoding: "
            + VectorEncoding.BYTE);
  }
  return new ByteVectorSimilarityValuesSource(queryVector, vectorField).getValues(ctx, null);
}

/**
 * Returns a DoubleValues instance for computing the vector similarity score per document against
 * the float query vector
 *
 * <p>If the segment behind {@code ctx} does not contain {@code vectorField} at all, an empty
 * instance is returned (no document has a value) instead of failing.
 *
 * @param ctx the context for which to return the DoubleValues
 * @param queryVector float query vector
 * @param vectorField knn float field name
 * @return DoubleValues instance
 * @throws IllegalArgumentException if the field exists but is not encoded as {@link
 *     VectorEncoding#FLOAT32}
 * @throws IOException if an {@link IOException} occurs
 */
public static DoubleValues similarityToQueryVector(
    LeafReaderContext ctx, float[] queryVector, String vectorField) throws IOException {
  // fieldInfo(...) yields null for a field this segment does not know; guard before
  // dereferencing it so such a segment contributes no values rather than throwing an NPE.
  if (ctx.reader().getFieldInfos().fieldInfo(vectorField) == null) {
    return DoubleValues.EMPTY;
  }
  if (ctx.reader().getFieldInfos().fieldInfo(vectorField).getVectorEncoding()
      != VectorEncoding.FLOAT32) {
    throw new IllegalArgumentException(
        "Field "
            + vectorField
            + " does not have the expected vector encoding: "
            + VectorEncoding.FLOAT32);
  }
  return new FloatVectorSimilarityValuesSource(queryVector, vectorField).getValues(ctx, null);
}

/**
* Creates a DoubleValuesSource that wraps a generic NumericDocValues field
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search;

import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.VectorSimilarityFunction;

/**
 * A {@link DoubleValuesSource} which computes the vector similarity scores between the query vector
 * and the {@link org.apache.lucene.document.KnnFloatVectorField} for documents.
 *
 * <p>The similarity function is taken, per segment, from the field's own configuration, so the
 * score is computed with whatever function the field was indexed with. Segments that hold no
 * float vectors for the field produce an empty {@link DoubleValues} (no document has a value).
 */
class FloatVectorSimilarityValuesSource extends VectorSimilarityValuesSource {

  /** The query vector; stored by reference, not copied. */
  private final float[] queryVector;

  /**
   * Creates a source scoring documents against {@code vector}.
   *
   * @param vector the float query vector to compare each document's vector with
   * @param fieldName name of the knn float vector field
   */
  public FloatVectorSimilarityValuesSource(float[] vector, String fieldName) {
    super(fieldName);
    this.queryVector = vector;
  }

  /**
   * Returns per-document similarity scores for the given segment.
   *
   * @param ctx the segment to read vectors from
   * @param scores unused by this implementation
   * @return values that exist only for documents having a vector in {@code fieldName}; an empty
   *     instance if the segment has no float vectors for the field
   * @throws IOException if reading the vectors fails
   */
  @Override
  public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
    final FloatVectorValues vectorValues = ctx.reader().getFloatVectorValues(fieldName);
    if (vectorValues == null) {
      // The reader has no float vectors for this field in this segment (e.g. no document in the
      // segment indexed the field). Previously this fell through to a NullPointerException.
      return DoubleValues.EMPTY;
    }
    // Looked up only after confirming that vectors exist: fieldInfo(...) is not dereferenced for
    // a segment that does not know the field.
    final VectorSimilarityFunction function =
        ctx.reader().getFieldInfos().fieldInfo(fieldName).getVectorSimilarityFunction();
    return new DoubleValues() {
      @Override
      public double doubleValue() throws IOException {
        return function.compare(queryVector, vectorValues.vectorValue());
      }

      @Override
      public boolean advanceExact(int doc) throws IOException {
        // The underlying iterator is only moved forward: a target behind the current position
        // reports "no value"; otherwise we either already sit on the target or advance to it.
        return doc >= vectorValues.docID()
            && (vectorValues.docID() == doc || vectorValues.advance(doc) == doc);
      }
    };
  }

  @Override
  public int hashCode() {
    return Objects.hash(fieldName, Arrays.hashCode(queryVector));
  }

  /** Two sources are equal when they target the same field with element-wise equal vectors. */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) return true;
    if (obj == null || getClass() != obj.getClass()) return false;
    FloatVectorSimilarityValuesSource other = (FloatVectorSimilarityValuesSource) obj;
    return Objects.equals(fieldName, other.fieldName)
        && Arrays.equals(queryVector, other.queryVector);
  }

  @Override
  public String toString() {
    return "FloatVectorSimilarityValuesSource(fieldName="
        + fieldName
        + " queryVector="
        + Arrays.toString(queryVector)
        + ")";
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search;

import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;

/**
 * Common base for {@link DoubleValuesSource} implementations that score documents by the
 * similarity between a query vector and a document's {@link
 * org.apache.lucene.document.KnnFloatVectorField} or {@link
 * org.apache.lucene.document.KnnByteVectorField} value.
 *
 * <p>Subclasses supply the query vector and the per-segment {@link #getValues} logic; this class
 * only holds the field name and answers the fixed parts of the {@link DoubleValuesSource}
 * contract.
 */
abstract class VectorSimilarityValuesSource extends DoubleValuesSource {

  /** Name of the vector field whose values are compared with the query vector. */
  protected final String fieldName;

  /**
   * @param fieldName name of the vector field to read document vectors from
   */
  public VectorSimilarityValuesSource(String fieldName) {
    this.fieldName = fieldName;
  }

  /** Implemented by subclasses to produce the similarity values for one segment. */
  @Override
  public abstract DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores)
      throws IOException;

  /** Always reports the source as cacheable for the given segment. */
  @Override
  public boolean isCacheable(LeafReaderContext ctx) {
    return true;
  }

  /** Query scores are never required by this source. */
  @Override
  public boolean needsScores() {
    return false;
  }

  /** No rewriting is performed; the same instance is returned. */
  @Override
  public DoubleValuesSource rewrite(IndexSearcher reader) throws IOException {
    return this;
  }
}
Loading

0 comments on commit 4ac2f43

Please sign in to comment.