From 55bbb9c86c4dd7d9a336e0ba638d525efdb689e3 Mon Sep 17 00:00:00 2001 From: pandu-k <107458762+pandu-k@users.noreply.github.com> Date: Fri, 3 Mar 2023 15:55:31 +1100 Subject: [PATCH] Prevent extra fields in index settings (#365) * prevent xtra fields in index_settings * removed incorrect field from index_defaults in test --- .../tensor_search/models/settings_object.py | 2 + tests/s2_inference/test_generic_clip_model.py | 2 - tests/tensor_search/test_validation.py | 165 ++++++++++++++++-- 3 files changed, 151 insertions(+), 18 deletions(-) diff --git a/src/marqo/tensor_search/models/settings_object.py b/src/marqo/tensor_search/models/settings_object.py index dfff0f354..72a9490f8 100644 --- a/src/marqo/tensor_search/models/settings_object.py +++ b/src/marqo/tensor_search/models/settings_object.py @@ -8,6 +8,7 @@ NsFields.index_defaults, NsFields.number_of_shards ], + "additionalProperties": False, "properties": { NsFields.index_defaults: { "type": "object", @@ -18,6 +19,7 @@ NsFields.text_preprocessing, NsFields.image_preprocessing ], + "additionalProperties": False, "properties": { NsFields.treat_urls_and_pointers_as_images: { "type": "boolean", diff --git a/tests/s2_inference/test_generic_clip_model.py b/tests/s2_inference/test_generic_clip_model.py index d38bca842..719720ebe 100644 --- a/tests/s2_inference/test_generic_clip_model.py +++ b/tests/s2_inference/test_generic_clip_model.py @@ -362,7 +362,6 @@ def test_vectorise_without_clip_type(self): def test_validate_model_properties_unknown_model_error(self): - pass """_validate_model_properties should throw an error if model is not in registry, and if model_properties have not been given in index """ @@ -372,7 +371,6 @@ def test_validate_model_properties_unknown_model_error(self): index_settings={ "index_defaults": { 'model': model_name, - 'type' : "clip" } } ) diff --git a/tests/tensor_search/test_validation.py b/tests/tensor_search/test_validation.py index acec3b683..60cafa9fd 100644 --- a/tests/tensor_search/test_validation.py +++ b/tests/tensor_search/test_validation.py @@ -325,27 +325,65 @@ def get_good_index_settings(): } def test_validate_index_settings(self): - good_settings = { - "index_defaults": { - "treat_urls_and_pointers_as_images": False, - "model": "hf/all_datasets_v4_MiniLM-L6", - "normalize_embeddings": True, - "text_preprocessing": { - "split_length": 2, - "split_overlap": 0, - "split_method": "sentence" + + good_settings =[ + { + "index_defaults": { + "treat_urls_and_pointers_as_images": False, + "model": "hf/all_datasets_v4_MiniLM-L6", + "normalize_embeddings": True, + "text_preprocessing": { + "split_length": 2, + "split_overlap": 0, + "split_method": "sentence" + }, + "image_preprocessing": { + "patch_method": None + } }, - "image_preprocessing": { - "patch_method": None - } + "number_of_shards": 5 }, - "number_of_shards": 5 - } - assert good_settings == validation.validate_settings_object(good_settings) + { # extra field in text_preprocessing: OK + "index_defaults": { + "treat_urls_and_pointers_as_images": False, + "model": "hf/all_datasets_v4_MiniLM-L6", + "normalize_embeddings": True, + "text_preprocessing": { + "split_length": 2, + "split_overlap": 0, + "split_method": "sentence", + "blah blah blah": "woohoo" + }, + "image_preprocessing": { + "patch_method": None + } + }, + "number_of_shards": 5 + }, + { # extra field in image_preprocessing: OK + "index_defaults": { + "treat_urls_and_pointers_as_images": False, + "model": "hf/all_datasets_v4_MiniLM-L6", + "normalize_embeddings": True, + "text_preprocessing": { + "split_length": 2, + "split_overlap": 0, + "split_method": "sentence", + }, + "image_preprocessing": { + "patch_method": None, + "blah blah blah": "woohoo" + } + }, + "number_of_shards": 5, + } + ] + for settings in good_settings: + assert settings == validation.validate_settings_object(settings) def test_validate_index_settings_model_properties(self): good_settings = self.get_good_index_settings() - good_settings['model_properties'] = dict() + good_settings['index_defaults']['model_properties'] = dict() assert good_settings == validation.validate_settings_object(good_settings) def test_validate_index_settings_bad(self): @@ -422,6 +460,101 @@ def test_validate_index_settings_img_preprocessing(self): settings['index_defaults']['image_preprocessing']["path_method"] = "frcnn" assert settings == validation.validate_settings_object(settings) + def test_validate_index_settings_misplaced_fields(self): + bad_settings = [ + { + "index_defaults": { + "treat_urls_and_pointers_as_images": False, + "model": "hf/all_datasets_v4_MiniLM-L6", + "normalize_embeddings": True, + "text_preprocessing": { + "split_length": 2, + "split_overlap": 0, + "split_method": "sentence" + }, + "image_preprocessing": { + "patch_method": None + } + }, + "number_of_shards": 5, + "model": "hf/all_datasets_v4_MiniLM-L6" # model is also outside, here... + }, + { + "index_defaults": { + "image_preprocessing": { + "patch_method": None # no models here + }, + "normalize_embeddings": True, + "text_preprocessing": { + "split_length": 2, + "split_method": "sentence", + "split_overlap": 0 + }, + "treat_urls_and_pointers_as_images": False + }, + "model": "open_clip/ViT-L-14/laion2b_s32b_b82k", # model here (bad) + "number_of_shards": 5, + "treat_urls_and_pointers_as_images": True + }, + { + "index_defaults": { + "image_preprocessing": { + "patch_method": None, + "model": "open_clip/ViT-L-14/laion2b_s32b_b82k", + }, + "normalize_embeddings": True, + "text_preprocessing": { + "split_length": 2, + "split_method": "sentence", + "split_overlap": 0 + }, + "treat_urls_and_pointers_as_images": False, + "number_of_shards": 5, # shouldn't be here + }, + "treat_urls_and_pointers_as_images": True + }, + { # good, BUT extra field in index_defaults + "index_defaults": { + "number_of_shards": 5, + "treat_urls_and_pointers_as_images": False, + "model": "hf/all_datasets_v4_MiniLM-L6", + "normalize_embeddings": True, + "text_preprocessing": { + "split_length": 2, + "split_overlap": 0, + "split_method": "sentence" + }, + "image_preprocessing": { + "patch_method": None + } + }, + "number_of_shards": 5 + }, + { # good, BUT extra field in root + "model": "hf/all_datasets_v4_MiniLM-L6", + "index_defaults": { + "treat_urls_and_pointers_as_images": False, + "model": "hf/all_datasets_v4_MiniLM-L6", + "normalize_embeddings": True, + "text_preprocessing": { + "split_length": 2, + "split_overlap": 0, + "split_method": "sentence" + }, + "image_preprocessing": { + "patch_method": None + } + }, + "number_of_shards": 5 + } + ] + for bad_set in bad_settings: + try: + validation.validate_settings_object(bad_set) + raise AssertionError + except InvalidArgError as e: + pass + def test_validate_mappings(self): mappings = [ {