From c0d527809f4c3a774e4e193cd86f9b98a28e860c Mon Sep 17 00:00:00 2001
From: Ishaaq Chandy
Date: Wed, 11 Dec 2024 16:02:54 +1100
Subject: [PATCH] Add SigLIP imagePreprocessor config

SigLIP models do not use the default OpenCLIP preprocessor, which can
result in failure to load an index properly. Also set size as per the
base model's image_size configuration.

See https://docs.marqo.ai/latest/models/marqo/bring-your-own-model/#example-2-load-a-custom-openclip-model-from-a-public-url-with-custom-configurations

> It is very important to provide the correct imagePreprocessor
> configuration to match the model architecture as Marqo can not infer
> the correct configuration from the model name when you load a
> checkpoint file and will use the default configuration("OpenCLIP")
---
 .../open_clip_properties.py                  | 26 +++++++++++++++++++
 tests/test_model_properties.py               | 16 ++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/src/marqo_commons/model_registry/model_properties_data/open_clip_properties.py b/src/marqo_commons/model_registry/model_properties_data/open_clip_properties.py
index c5bc50f..f29e1a4 100644
--- a/src/marqo_commons/model_registry/model_properties_data/open_clip_properties.py
+++ b/src/marqo_commons/model_registry/model_properties_data/open_clip_properties.py
@@ -22,6 +22,8 @@ class OpenClipModelProperties(ModelProperties):
     modality: List[Modality] = [Modality.text, Modality.image]
     type: ModelType = ModelType.open_clip
     pretrained: Optional[str] = None
+    imagePreprocessor: Optional[str] = None  # if unspecified, Marqo defaults to "OpenCLIP"
+    size: Optional[int] = None  # image_size for the image preprocessor
     notes: str = ""
 
     @classmethod
@@ -368,6 +370,8 @@ def get_all_model_properties_objects(cls) -> Dict[str, "OpenClipModelProperties"
             dimensions=1152,
             notes="open_clip model: ViT-SO400M-14-SigLIP-384/webli",
             pretrained="webli",
+            imagePreprocessor="SigLIP",
+            size=384,
         ),
         "open_clip/ViT-H-14-378-quickgelu/dfn5b": OpenClipModelProperties(
             name="open_clip/ViT-H-14-378-quickgelu/dfn5b",
@@ -380,6 +384,8 @@ def get_all_model_properties_objects(cls) -> Dict[str, "OpenClipModelProperties"
             dimensions=1024,
             notes="open_clip model: ViT-L-16-SigLIP-384/webli",
             pretrained="webli",
+            imagePreprocessor="SigLIP",
+            size=384,
         ),
         "open_clip/ViT-H-14-quickgelu/dfn5b": OpenClipModelProperties(
             name="open_clip/ViT-H-14-quickgelu/dfn5b",
@@ -392,30 +398,40 @@ def get_all_model_properties_objects(cls) -> Dict[str, "OpenClipModelProperties"
             dimensions=1024,
             notes="open_clip model: ViT-L-16-SigLIP-256/webli",
             pretrained="webli",
+            imagePreprocessor="SigLIP",
+            size=256,
         ),
         "open_clip/ViT-B-16-SigLIP-512/webli": OpenClipModelProperties(
             name="open_clip/ViT-B-16-SigLIP-512/webli",
             dimensions=768,
             notes="open_clip model: ViT-B-16-SigLIP-512/webli",
             pretrained="webli",
+            imagePreprocessor="SigLIP",
+            size=512,
         ),
         "open_clip/ViT-B-16-SigLIP-384/webli": OpenClipModelProperties(
             name="open_clip/ViT-B-16-SigLIP-384/webli",
             dimensions=768,
             notes="open_clip model: ViT-B-16-SigLIP-384/webli",
             pretrained="webli",
+            imagePreprocessor="SigLIP",
+            size=384,
         ),
         "open_clip/ViT-B-16-SigLIP-256/webli": OpenClipModelProperties(
             name="open_clip/ViT-B-16-SigLIP-256/webli",
             dimensions=768,
             notes="open_clip model: ViT-B-16-SigLIP-256/webli",
             pretrained="webli",
+            imagePreprocessor="SigLIP",
+            size=256,
         ),
         "open_clip/ViT-B-16-SigLIP/webli": OpenClipModelProperties(
             name="open_clip/ViT-B-16-SigLIP/webli",
             dimensions=768,
             notes="open_clip model: ViT-B-16-SigLIP/webli",
             pretrained="webli",
+            imagePreprocessor="SigLIP",
+            size=224,
         ),
         "open_clip/ViT-L-14-quickgelu/dfn2b": OpenClipModelProperties(
             name="open_clip/ViT-L-14-quickgelu/dfn2b",
@@ -468,26 +484,36 @@ def get_all_model_properties_objects(cls) -> Dict[str, "OpenClipModelProperties"
             name="hf-hub:Marqo/marqo-fashionSigLIP",
             dimensions=768,
             note="Marqo's fashionSigLIP model",
+            imagePreprocessor="SigLIP",
+            size=224,
         ),
         "visheratin/nllb-clip-base-siglip": OpenClipModelProperties(
             name="hf-hub:visheratin/nllb-clip-base-siglip",
             dimensions=768,
             note="A multilingual CLIP model",
+            imagePreprocessor="SigLIP",
+            size=384,
         ),
         "visheratin/nllb-siglip-mrl-base": OpenClipModelProperties(
             name="hf-hub:visheratin/nllb-siglip-mrl-base",
             dimensions=768,
             note="A multilingual CLIP model",
+            imagePreprocessor="SigLIP",
+            size=384,
         ),
         "visheratin/nllb-clip-large-siglip": OpenClipModelProperties(
             name="hf-hub:visheratin/nllb-clip-large-siglip",
             dimensions=1152,
             note="A multilingual CLIP model",
+            imagePreprocessor="SigLIP",
+            size=384,
         ),
         "visheratin/nllb-siglip-mrl-large": OpenClipModelProperties(
             name="hf-hub:visheratin/nllb-siglip-mrl-large",
             dimensions=1152,
             note="A multilingual CLIP model",
+            imagePreprocessor="SigLIP",
+            size=384,
         ),
     }
diff --git a/tests/test_model_properties.py b/tests/test_model_properties.py
index 5444015..d739a78 100644
--- a/tests/test_model_properties.py
+++ b/tests/test_model_properties.py
@@ -171,3 +171,19 @@ def test_old_model_registry_matches_new(self):
                 new_model_registry_dict[model][key],
                 f"Model {model} has different value for key {key} in old and new model registry",
             )
+
+    def test_siglip_models_have_imagepreprocessor_set(self):
+        open_clip_properties = _get_open_clip_properties()
+        for model_name, model_property in open_clip_properties.items():
+            if "siglip" in model_name.lower():
+                self.assertEqual(
+                    model_property["imagePreprocessor"],
+                    "SigLIP",
+                    f"Model {model_name} does not have imagePreprocessor set to SigLIP",
+                )
+            else:
+                # not a SigLIP model, so imagePreprocessor should be unset
+                self.assertIsNone(
+                    model_property.get("imagePreprocessor"),
+                    f"Model {model_name} has imagePreprocessor set even though it is not a SigLIP model",
+                )
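
Note (reviewer context, not part of the patch): per the linked bring-your-own-model
docs, these registry fields mirror the modelProperties a user supplies when loading
a custom checkpoint. Below is a minimal sketch of where imagePreprocessor and size
matter in practice, assuming the marqo Python client and the settings shape shown in
the docs' example 2; the endpoint, index name, and checkpoint URL are placeholders.

    # A sketch, not part of this patch: creating an index backed by a custom
    # SigLIP checkpoint, following the linked "bring your own model" docs.
    import marqo

    mq = marqo.Client(url="http://localhost:8882")  # placeholder endpoint

    settings = {
        "treatUrlsAndPointersAsImages": True,
        "model": "my-custom-siglip",  # arbitrary identifier for the custom model
        "modelProperties": {
            "name": "ViT-B-16-SigLIP",  # open_clip architecture name
            "dimensions": 768,
            "url": "https://example.com/siglip-checkpoint.pt",  # placeholder URL
            "type": "open_clip",
            # Marqo cannot infer this from a checkpoint file; if omitted, the
            # default "OpenCLIP" preprocessor is used, which mismatches SigLIP:
            "imagePreprocessor": "SigLIP",
            "size": 224,  # must match the base model's image_size
        },
        "normalizeEmbeddings": True,
    }

    mq.create_index("my-siglip-index", settings_dict=settings)

Without the imagePreprocessor entry, images would be normalized and resized with the
default OpenCLIP pipeline, yielding embeddings inconsistent with the SigLIP weights,
which is the failure mode this patch guards against for the registry's built-in models.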