Prevent extra fields in index settings (#365)

* prevent xtra fields in index_settings * removed incorrect field from index_defaults in test
marqo-ai · Mar 3, 2023 · 55bbb9c · 55bbb9c
1 parent 73bd429
commit 55bbb9c
Show file tree

Hide file tree

Showing 3 changed files with 151 additions and 18 deletions.
diff --git a/src/marqo/tensor_search/models/settings_object.py b/src/marqo/tensor_search/models/settings_object.py
@@ -8,6 +8,7 @@
         NsFields.index_defaults,
         NsFields.number_of_shards
     ],
+    "additionalProperties": False,
     "properties": {
         NsFields.index_defaults: {
             "type": "object",
@@ -18,6 +19,7 @@
                 NsFields.text_preprocessing,
                 NsFields.image_preprocessing
             ],
+            "additionalProperties": False,
             "properties": {
                 NsFields.treat_urls_and_pointers_as_images: {
                     "type": "boolean",

diff --git a/tests/s2_inference/test_generic_clip_model.py b/tests/s2_inference/test_generic_clip_model.py
@@ -362,7 +362,6 @@ def test_vectorise_without_clip_type(self):
 
 
     def test_validate_model_properties_unknown_model_error(self):
-        pass
         """_validate_model_properties should throw an error if model is not in registry,
             and if model_properties have not been given in index
         """
@@ -372,7 +371,6 @@ def test_validate_model_properties_unknown_model_error(self):
             index_settings={
                 "index_defaults": {
                     'model': model_name,
-                    'type' : "clip"
                 }
             }
         )

diff --git a/tests/tensor_search/test_validation.py b/tests/tensor_search/test_validation.py
@@ -325,27 +325,65 @@ def get_good_index_settings():
         }
 
     def test_validate_index_settings(self):
-        good_settings = {
-            "index_defaults": {
-                "treat_urls_and_pointers_as_images": False,
-                "model": "hf/all_datasets_v4_MiniLM-L6",
-                "normalize_embeddings": True,
-                "text_preprocessing": {
-                    "split_length": 2,
-                    "split_overlap": 0,
-                    "split_method": "sentence"
+
+        good_settings =[
+            {
+                "index_defaults": {
+                    "treat_urls_and_pointers_as_images": False,
+                    "model": "hf/all_datasets_v4_MiniLM-L6",
+                    "normalize_embeddings": True,
+                    "text_preprocessing": {
+                        "split_length": 2,
+                        "split_overlap": 0,
+                        "split_method": "sentence"
+                    },
+                    "image_preprocessing": {
+                        "patch_method": None
+                    }
                 },
-                "image_preprocessing": {
-                    "patch_method": None
-                }
+                "number_of_shards": 5
             },
-            "number_of_shards": 5
-        }
-        assert good_settings == validation.validate_settings_object(good_settings)
+            {   # extra field in text_preprocessing: OK
+                "index_defaults": {
+                    "treat_urls_and_pointers_as_images": False,
+                    "model": "hf/all_datasets_v4_MiniLM-L6",
+                    "normalize_embeddings": True,
+                    "text_preprocessing": {
+                        "split_length": 2,
+                        "split_overlap": 0,
+                        "split_method": "sentence",
+                        "blah blah blah": "woohoo"
+                    },
+                    "image_preprocessing": {
+                        "patch_method": None
+                    }
+                },
+                "number_of_shards": 5
+            },
+            {  # extra field in image_preprocessing: OK
+                "index_defaults": {
+                    "treat_urls_and_pointers_as_images": False,
+                    "model": "hf/all_datasets_v4_MiniLM-L6",
+                    "normalize_embeddings": True,
+                    "text_preprocessing": {
+                        "split_length": 2,
+                        "split_overlap": 0,
+                        "split_method": "sentence",
+                    },
+                    "image_preprocessing": {
+                        "patch_method": None,
+                        "blah blah blah": "woohoo"
+                    }
+                },
+                "number_of_shards": 5,
+            }
+        ]
+        for settings in good_settings:
+            assert settings == validation.validate_settings_object(settings)
 
     def test_validate_index_settings_model_properties(self):
         good_settings = self.get_good_index_settings()
-        good_settings['model_properties'] = dict()
+        good_settings['index_defaults']['model_properties'] = dict()
         assert good_settings == validation.validate_settings_object(good_settings)
 
     def test_validate_index_settings_bad(self):
@@ -422,6 +460,101 @@ def test_validate_index_settings_img_preprocessing(self):
         settings['index_defaults']['image_preprocessing']["path_method"] = "frcnn"
         assert settings == validation.validate_settings_object(settings)
 
+    def test_validate_index_settings_misplaced_fields(self):
+        bad_settings = [
+            {
+                "index_defaults": {
+                    "treat_urls_and_pointers_as_images": False,
+                    "model": "hf/all_datasets_v4_MiniLM-L6",
+                    "normalize_embeddings": True,
+                    "text_preprocessing": {
+                        "split_length": 2,
+                        "split_overlap": 0,
+                        "split_method": "sentence"
+                    },
+                    "image_preprocessing": {
+                        "patch_method": None
+                    }
+                },
+                "number_of_shards": 5,
+                "model": "hf/all_datasets_v4_MiniLM-L6"  # model is also outside, here...
+            },
+            {
+                "index_defaults": {
+                    "image_preprocessing": {
+                        "patch_method": None  # no models here
+                    },
+                    "normalize_embeddings": True,
+                    "text_preprocessing": {
+                        "split_length": 2,
+                        "split_method": "sentence",
+                        "split_overlap": 0
+                    },
+                    "treat_urls_and_pointers_as_images": False
+                },
+                "model": "open_clip/ViT-L-14/laion2b_s32b_b82k", # model here (bad)
+                "number_of_shards": 5,
+                "treat_urls_and_pointers_as_images": True
+            },
+            {
+                "index_defaults": {
+                    "image_preprocessing": {
+                        "patch_method": None,
+                        "model": "open_clip/ViT-L-14/laion2b_s32b_b82k",
+                    },
+                    "normalize_embeddings": True,
+                    "text_preprocessing": {
+                        "split_length": 2,
+                        "split_method": "sentence",
+                        "split_overlap": 0
+                    },
+                    "treat_urls_and_pointers_as_images": False,
+                    "number_of_shards": 5,  # shouldn't be here
+                },
+                "treat_urls_and_pointers_as_images": True
+            },
+            {  # good, BUT extra field in index_defaults
+                "index_defaults": {
+                    "number_of_shards": 5,
+                    "treat_urls_and_pointers_as_images": False,
+                    "model": "hf/all_datasets_v4_MiniLM-L6",
+                    "normalize_embeddings": True,
+                    "text_preprocessing": {
+                        "split_length": 2,
+                        "split_overlap": 0,
+                        "split_method": "sentence"
+                    },
+                    "image_preprocessing": {
+                        "patch_method": None
+                    }
+                },
+                "number_of_shards": 5
+            },
+            {  # good, BUT extra field in root
+                "model": "hf/all_datasets_v4_MiniLM-L6",
+                "index_defaults": {
+                    "treat_urls_and_pointers_as_images": False,
+                    "model": "hf/all_datasets_v4_MiniLM-L6",
+                    "normalize_embeddings": True,
+                    "text_preprocessing": {
+                        "split_length": 2,
+                        "split_overlap": 0,
+                        "split_method": "sentence"
+                    },
+                    "image_preprocessing": {
+                        "patch_method": None
+                    }
+                },
+                "number_of_shards": 5
+            }
+        ]
+        for bad_set in bad_settings:
+            try:
+                validation.validate_settings_object(bad_set)
+                raise AssertionError
+            except InvalidArgError as e:
+                pass
+
     def test_validate_mappings(self):
         mappings = [
              {