Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add numberOfVectors to get_stats api #553

Merged
merged 14 commits into from
Jul 28, 2023
19 changes: 17 additions & 2 deletions src/marqo/tensor_search/tensor_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,9 +267,24 @@ def _autofill_index_settings(index_settings: dict):


def get_stats(config: Config, index_name: str):
    """Return the number of documents and vectors in the index.

    The _count API counts top-level documents.
    The _stats API includes the count of nested documents, which is __chunks in Marqo.
    The difference between the two gives numberOfVectors.

    Args:
        config: configuration handed to HttpRequests for both calls.
        index_name: name of the index to report on.

    Returns:
        A dict with the keys "numberOfDocuments" and "numberOfVectors".

    Raises:
        errors.InternalError: if either response does not have the expected
            fields or shape.
    """
    try:
        doc_count = HttpRequests(config).post(path=F"{index_name}/_count")["count"]
        stats_response = HttpRequests(config).get(path=F"{index_name}/_stats/docs")
        nested_doc_count = stats_response["indices"][index_name]["primaries"]["docs"]["count"]
    except (KeyError, TypeError) as e:
        # KeyError covers a missing field. TypeError covers a response whose
        # shape is not the expected nesting of dicts (for example None or a
        # list at some level), which fails on subscripting instead of lookup.
        # The error is raised, not returned, so callers never receive an
        # exception object in place of the stats dict.
        raise errors.InternalError(
            f"Marqo encountered an unexpected response from Marqo-os when calling `get_stats(index={index_name})`. "
            f"The expected fields do not exist in the response. Original error message = {e}"
        ) from e

    return {
        "numberOfDocuments": doc_count,
        "numberOfVectors": nested_doc_count - doc_count
    }


Expand Down
69 changes: 63 additions & 6 deletions tests/tensor_search/test_get_stats.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add some tests for edge cases and HTTP errors.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's also add some cases where the HTTP response is not in the expected shape (for example, a dict with an unknown structure that is returned because of an error).

Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,82 @@ def setUp(self) -> None:
except IndexNotFoundError as s:
pass

def test_get_stats_empty(self):
def tearDown(self) -> None:
    """Delete the test index, ignoring the case where it does not exist."""
    try:
        tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
    except IndexNotFoundError:
        # Best-effort cleanup: there is nothing to remove if the index is absent.
        pass

def test_get_stats_empty(self):
    """A newly created index reports zero documents and zero vectors."""
    tensor_search.create_vector_index(config=self.config, index_name=self.index_name_1)
    stats = tensor_search.get_stats(config=self.config, index_name=self.index_name_1)
    assert stats["numberOfDocuments"] == 0
    # Nothing has been added, so there are no nested chunk documents either
    # and the vector count must also be zero.
    assert stats["numberOfVectors"] == 0

def test_get_stats_non_empty(self):
    """After adding three documents, numberOfDocuments is 3."""
    # No index deletion here: the sibling tests also create the index directly
    # and rely on the per-test cleanup around them.
    # NOTE(review): assumes setUp deletes the index before each test - confirm.
    tensor_search.create_vector_index(config=self.config, index_name=self.index_name_1)
    tensor_search.add_documents(
        config=self.config, add_docs_params=AddDocsParams(
            docs=[{"1": "2"}, {"134": "2"}, {"14": "62"}],
            index_name=self.index_name_1,
            auto_refresh=True, device="cpu"
        )
    )
    assert tensor_search.get_stats(config=self.config, index_name=self.index_name_1)["numberOfDocuments"] == 3

def test_get_stats_number_of_vectors(self):
    """Check numberOfDocuments and numberOfVectors for a mixed batch.

    Five documents are added; the per-document comments below give the
    number of vectors each one is expected to contribute, seven in total.
    """
    expected_number_of_documents = 5
    expected_number_of_vectors = 7

    documents = [
        {"description_1": "test-2", "description_2": "test"},  # 2 vectors
        {"description_1": "test-2", "description_2": "test", "description_3": "test"},  # 3 vectors
        {"description_2": "test"},  # 1 vector
        {"my_multi_modal_field": {"text_1": "test", "text_2": "test"}},  # 1 vector
        {"non_tensor_field": "test"},  # 0 vectors
    ]
    multimodal_mappings = {
        "my_multi_modal_field": {
            "type": "multimodal_combination",
            "weights": {"text_1": 0.5, "text_2": 0.8},
        }
    }

    tensor_search.create_vector_index(
        config=self.config,
        index_name=self.index_name_1,
        index_settings={'index_defaults': {"model": "random/small"}},
    )
    add_params = AddDocsParams(
        docs=documents,
        index_name=self.index_name_1,
        auto_refresh=True,
        device="cpu",
        non_tensor_fields=["non_tensor_field"],
        mappings=multimodal_mappings,
    )
    tensor_search.add_documents(config=self.config, add_docs_params=add_params)

    # Stats are fetched separately for each assertion, as before.
    document_stats = tensor_search.get_stats(config=self.config, index_name=self.index_name_1)
    assert document_stats["numberOfDocuments"] == expected_number_of_documents
    vector_stats = tensor_search.get_stats(config=self.config, index_name=self.index_name_1)
    assert vector_stats["numberOfVectors"] == expected_number_of_vectors

def test_get_stats_number_of_vectors_with_list_field(self):
    """Check the counts when a non-tensor list field is also present.

    Six documents are added; the per-document comments below give the number
    of vectors each one is expected to contribute, seven in total. The name
    is distinct from the other number-of-vectors test so that two methods of
    the same name do not shadow each other and both tests run.
    """
    tensor_search.create_vector_index(config=self.config, index_name=self.index_name_1,
                                      index_settings={'index_defaults': {"model": "random/small"}})
    expected_number_of_vectors = 7
    expected_number_of_documents = 6
    tensor_search.add_documents(
        config=self.config, add_docs_params=AddDocsParams(
            docs=[
                {"description_1": "test-2", "description_2": "test"},  # 2 vectors
                {"description_1": "test-2", "description_2": "test", "description_3": "test"},  # 3 vectors
                {"description_2": "test"},  # 1 vector
                {"my_multi_modal_field": {
                    "text_1": "test", "text_2": "test"}},  # 1 vector
                {"non_tensor_field": "test"},  # 0 vectors
                {"list_field": ["this", "that"]},  # 0 vectors
            ],
            index_name=self.index_name_1,
            auto_refresh=True, device="cpu",
            non_tensor_fields=["non_tensor_field", "list_field"],
            mappings={"my_multi_modal_field":
                          {"type": "multimodal_combination", "weights": {"text_1": 0.5, "text_2": 0.8}}}
        )
    )

    assert tensor_search.get_stats(config=self.config, index_name=self.index_name_1)["numberOfDocuments"] \
           == expected_number_of_documents
    assert tensor_search.get_stats(config=self.config, index_name=self.index_name_1)["numberOfVectors"] \
           == expected_number_of_vectors