Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Danyil/fix cloud cleanup #216

Merged
merged 5 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions tests/cloud_test_logic/cloud_test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@ class CloudTestIndex(str, Enum):
-> 4) use structured_image_prepro
"""

unstructured_text = "unstr_txt"
unstructured_image = "unstr_img"
unstructured_text_custom_prepro = "unstr_txt_custom_prepro"
unstructured_text = "pymarqo_unstr_txt"
unstructured_image = "pymarqo_unstr_img"
unstructured_text_custom_prepro = "pymarqo_unstr_txt_custom_prepro"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this index created during the test? There is a restriction on the length of the index name (32 chars). Note that we append 4 chars as a unique identifier in addition to this string.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one is actually not used anywhere and it just needs to be renamed.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make sure this index can be created correctly and that the custom model can be used? This is related to custom models, which are kind of important.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could add it and add another test, but that would force us to create 4 indexes per test execution. @pandu-k should we do it?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Custom models are an important use case. Ideally we find a way to make it work with the current indexes, but if we need to add a new index for this to work, we can do that too.

To be honest, when we fix this, this test should be extended to include private (auth-required) models.


structured_image_prepro = "str_img_prepro"
structured_image_custom = "str_img_custom"
structured_text = "str_txt"
structured_image = "str_img"
structured_image_prepro = "pymarqo_str_img_prepro"
structured_image_custom = "pymarqo_str_img_custom"
structured_text = "pymarqo_str_txt"
structured_image = "pymarqo_str_img"


index_name_to_settings_mappings = {
Expand Down Expand Up @@ -74,7 +74,10 @@ class CloudTestIndex(str, Enum):
{"name": "int_filter_field_1", "type": "int", "features": ["filter", "score_modifier"]},
{"name": "bool_field_1", "type": "bool", "features": ["filter"]},
],
"tensorFields": ["text_field_1", "text_field_2", "text_field_3", "image_field_1"]
"tensorFields": ["text_field_1", "text_field_2", "text_field_3", "image_field_1"],
"imagePreprocessing": {
"patchMethod": "simple",
}
},
# CloudTestIndex.unstructured_text_custom_prepro: {
# "type": "unstructured",
Expand Down
7 changes: 4 additions & 3 deletions tests/cloud_test_logic/delete_all_cloud_test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,19 @@ def delete_all_test_indices(wait_for_readiness=False):
"url": os.environ.get("MARQO_URL", 'http://localhost:8882'),
}
suffix = os.environ.get("MQ_TEST_RUN_IDENTIFIER", None)
prefix = "pymarqo"
api_key = os.environ.get("MARQO_API_KEY", None)
if api_key:
local_marqo_settings["api_key"] = api_key
print(f"Deleting all test indices from Marqo Cloud Account that match the following criteria:")
print(f"- index name starts with 'test_index'")
print(f"- index name starts with '{prefix}'")
print(f"- index name contains the value of the environment variable MQ_TEST_RUN_IDENTIFIER: {suffix}\n")
client = marqo.Client(**local_marqo_settings)
indexes = client.get_indexes()
indices_to_delete = []
for index in indexes['results']:
if index["indexName"].startswith('test_index'):
if suffix is not None and suffix in index["indexName"].split('_'):
if index["indexName"].startswith(prefix):
if suffix is not None and index["indexName"].endswith(suffix):
indices_to_delete.append(index["indexName"])
elif suffix is None:
indices_to_delete.append(index["indexName"])
Expand Down
62 changes: 38 additions & 24 deletions tests/v2_tests/test_image_chunking.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json

import requests
from PIL import Image
from marqo.client import Client
Expand All @@ -11,7 +13,6 @@ class TestImageChunking(MarqoTestCase):
"""Test for image chunking as a preprocessing step
"""

@mark.fixed
def test_image_no_chunking(self):

image_size = (256, 384)
Expand Down Expand Up @@ -58,6 +59,7 @@ def test_image_no_chunking(self):
# the highlight should be the location
assert results['hits'][0]['_highlights'][0]['image_field_1'] == temp_file_name

@mark.fixed
def test_image_simple_chunking(self):

image_size = (256, 384)
Expand All @@ -70,51 +72,63 @@ def test_image_simple_chunking(self):
pass

settings = {
"treat_urls_and_pointers_as_images":True, # allows us to find an image file and index it
"model":"ViT-B/16",
"image_preprocessing_method":"simple"
"type": "structured",
"model": "open_clip/ViT-B-32/laion2b_s34b_b79k",
"allFields": [
{"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_2", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "image_field_1", "type": "image_pointer"},
],
"tensorFields": ["text_field_1", "text_field_2", "image_field_1"],
"imagePreprocessing": {
"patchMethod": "simple",
},
}

test_index_name = self.create_test_index(
cloud_test_index_to_use=CloudTestIndex.image_index_with_preprocessing_method,
open_source_test_index_name=self.generic_test_index_name,
open_source_index_kwargs=settings,
test_index_name = self.get_test_index_name(
cloud_test_index_to_use=CloudTestIndex.structured_image,
open_source_test_index_name=None
)
if not self.client.config.is_marqo_cloud:
self.client.create_index(self.generic_test_index_name, settings_dict=settings)
test_index_name = self.generic_test_index_name
temp_file_name = 'https://avatars.githubusercontent.com/u/13092433?v=4'

img = Image.open(requests.get(temp_file_name, stream=True).raw)

document1 = {'_id': '1', # '_id' can be provided but is not required
'attributes': 'hello',
'description': 'the image chunking can (optionally) chunk the image into sub-patches (akin to segmenting text) by using either a learned model or simple box generation and cropping',
'location': temp_file_name}
document1 = {'_id': '1', # '_id' can be provided but is not required
'text_field_1': 'hello',
'text_field_2': 'the image chunking can (optionally) chunk the image into sub-patches (akin to segmenting text) by using either a learned model or simple box generation and cropping',
'image_field_1': temp_file_name}

client.index(test_index_name).add_documents([document1], tensor_fields=['location', 'description', 'attributes'])
client.index(test_index_name).add_documents([document1])

# test the search works
if self.IS_MULTI_INSTANCE:
self.warm_request(client.index(test_index_name).search,'a')

results = client.index(test_index_name).search('a')
print(results)
assert results['hits'][0]['location'] == temp_file_name
assert results['hits'][0]['image_field_1'] == temp_file_name

# search only the image location
if self.IS_MULTI_INSTANCE:
self.warm_request(client.index(test_index_name).search,'a', searchable_attributes=['location'])
self.warm_request(client.index(test_index_name).search,'a', searchable_attributes=['image_field_1'])

results = client.index(test_index_name).search('a', searchable_attributes=['location'])
print(results)
assert results['hits'][0]['location'] == temp_file_name
results = client.index(test_index_name).search('a', searchable_attributes=['image_field_1'])
assert results['hits'][0]['image_field_1'] == temp_file_name
# the highlight should be the location
assert results['hits'][0]['_highlights']['location'] != temp_file_name
assert len(results['hits'][0]['_highlights']['location']) == 4
assert all(isinstance(_n, (float, int)) for _n in results['hits'][0]['_highlights']['location'])
assert json.loads(results['hits'][0]['_highlights'][0]['image_field_1']) != temp_file_name
assert len(json.loads(results['hits'][0]['_highlights'][0]['image_field_1'])) == 4
assert all(
isinstance(_n, (float, int)) for _n in json.loads(results['hits'][0]['_highlights'][0]['image_field_1'])
)

# search using the image itself, should return a full sized image as highlight
if self.IS_MULTI_INSTANCE:
self.warm_request(client.index(test_index_name).search,temp_file_name)

results = client.index(test_index_name).search(temp_file_name)
print(results)
assert abs(np.array(results['hits'][0]['_highlights']['location']) - np.array([0, 0, img.size[0], img.size[1]])).sum() < 1e-6
assert abs(np.array(json.loads(results['hits'][0]['_highlights'][0]['image_field_1'])) - np.array([0, 0, img.size[0], img.size[1]])).sum() < 1e-6

if not self.client.config.is_marqo_cloud:
self.client.delete_index(test_index_name)
Loading