-
Notifications
You must be signed in to change notification settings - Fork 343
/
Copy pathhello_hybrid_bm25.py
178 lines (152 loc) · 6.52 KB
/
hello_hybrid_bm25.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# A demo showing hybrid semantic search with dense and full text search with BM25
# using Milvus.
#
# You can optionally choose to use the BGE-M3 model to embed the text as dense
# vectors, or simply use random generated vectors as an example.
#
# You can also use the BGE CrossEncoder model to rerank the search results.
#
# Note that the full text search feature is only available in Milvus 2.4.0 or
# higher version. Make sure you follow https://milvus.io/docs/install_standalone-docker.md
# to set up the latest version of Milvus in your local environment.
# To connect to Milvus server, you need the python client library called pymilvus.
# To use BGE-M3 model, you need to install the optional `model` module in pymilvus.
# You can get them by simply running the following commands:
#
# pip install pymilvus
# pip install pymilvus[model]
# If true, use BGE-M3 model to generate dense vectors.
# If false, use random numbers to compose dense vectors.
use_bge_m3 = False
# If true, the search result will be reranked using BGE CrossEncoder model.
use_reranker = False
# The overall steps are as follows:
# 1. embed the text as dense and sparse vectors
# 2. setup a Milvus collection to store the dense and sparse vectors
# 3. insert the data to Milvus
# 4. search and inspect the result!
import random
import string
import numpy as np
from pymilvus import (
utility,
FieldSchema,
CollectionSchema,
DataType,
Collection,
AnnSearchRequest,
RRFRanker,
connections,
Function,
FunctionType,
)
# 1. prepare a small corpus to search
docs = [
"Artificial intelligence was founded as an academic discipline in 1956.",
"Alan Turing was the first person to conduct substantial research in AI.",
"Born in Maida Vale, London, Turing was raised in southern England.",
]
# add some randomly generated texts
docs.extend(
[
" ".join(
"".join(random.choice(string.ascii_lowercase) for _ in range(random.randint(1, 8)))
for _ in range(10)
)
for _ in range(1000)
]
)
query = "Who started AI research?"
def random_embedding(texts):
rng = np.random.default_rng()
return {
"dense": np.random.rand(len(texts), 768),
}
dense_dim = 768
ef = random_embedding
if use_bge_m3:
# BGE-M3 model is included in the optional `model` module in pymilvus, to
# install it, simply run "pip install pymilvus[model]".
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
dense_dim = ef.dim["dense"]
docs_embeddings = ef(docs)
query_embeddings = ef([query])
# 2. setup Milvus collection and index
connections.connect("default", host="localhost", port="19530")
# Specify the data schema for the new Collection.
fields = [
# Use auto generated id as primary key
FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=100),
# Store the original text to retrieve based on semantically distance
FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512, enable_tokenizer=True),
# We need a sparse vector field to perform full text search with BM25,
# but you don't need to provide data for it when inserting data.
FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=dense_dim),
]
functions = [
Function(
name="bm25",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names="sparse_vector",
)
]
schema = CollectionSchema(fields, "", functions=functions)
col_name = "hybrid_bm25_demo"
# Now we can create the new collection with above name and schema.
col = Collection(col_name, schema, consistency_level="Strong")
# We need to create indices for the vector fields. The indices will be loaded
# into memory for efficient search.
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "BM25"}
col.create_index("sparse_vector", sparse_index)
dense_index = {"index_type": "FLAT", "metric_type": "IP"}
col.create_index("dense_vector", dense_index)
col.load()
# 3. insert text and sparse/dense vector representations into the collection
entities = [docs, docs_embeddings["dense"]]
col.insert(entities)
col.flush()
# 4. search and inspect the result!
k = 2 # we want to get the top 2 docs closest to the query
# Prepare the search requests for both full text search and dense vector search
full_text_search_params = {"metric_type": "BM25"}
# provide raw text query for full text search, while use the sparse vector as
# ANNS field
full_text_search_req = AnnSearchRequest([query], "sparse_vector", full_text_search_params, limit=k)
dense_search_params = {"metric_type": "IP"}
dense_req = AnnSearchRequest(
query_embeddings["dense"], "dense_vector", dense_search_params, limit=k
)
# Search topK docs based on dense and sparse vectors and rerank with RRF.
res = col.hybrid_search(
[full_text_search_req, dense_req], rerank=RRFRanker(), limit=k, output_fields=["text"]
)
# Currently Milvus only support 1 query in the same hybrid search request, so
# we inspect res[0] directly. In future release Milvus will accept batch
# hybrid search queries in the same call.
res = res[0]
if use_reranker:
result_texts = [hit.fields["text"] for hit in res]
from pymilvus.model.reranker import BGERerankFunction
bge_rf = BGERerankFunction(device="cpu")
# rerank the results using BGE CrossEncoder model
results = bge_rf(query, result_texts, top_k=2)
for hit in results:
print(f"text: {hit.text} distance {hit.score}")
else:
for hit in res:
print(f'text: {hit.fields["text"]} distance {hit.distance}')
# If you used both BGE-M3 and the reranker, you should see the following:
# text: Alan Turing was the first person to conduct substantial research in AI. distance 0.9306981017573297
# text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.03217001154515051
#
# If you used only BGE-M3, you should see the following:
# text: Alan Turing was the first person to conduct substantial research in AI. distance 0.032786883413791656
# text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.016129031777381897
# In this simple example the reranker yields the same result as the embedding based hybrid search, but in more complex
# scenarios the reranker can provide more accurate results.
# If you used random vectors, the result will be different each time you run the script.
# Drop the collection to clean up the data.
utility.drop_collection(col_name)