diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 44d435bc45a..6868175b2a7 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -54,6 +54,7 @@
         ("camembert", "CamembertConfig"),
         ("canine", "CanineConfig"),
         ("chinese_clip", "ChineseCLIPConfig"),
+        ("chinese_clip_vision_model", "ChineseCLIPVisionConfig"),
         ("clap", "ClapConfig"),
         ("clip", "CLIPConfig"),
         ("clip_vision_model", "CLIPVisionConfig"),
@@ -512,6 +513,7 @@
         ("camembert", "CamemBERT"),
         ("canine", "CANINE"),
         ("chinese_clip", "Chinese-CLIP"),
+        ("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
         ("clap", "CLAP"),
         ("clip", "CLIP"),
         ("clip_vision_model", "CLIPVisionModel"),
@@ -773,6 +775,7 @@
         ("xclip", "x_clip"),
         ("clip_vision_model", "clip"),
         ("siglip_vision_model", "siglip"),
+        ("chinese_clip_vision_model", "chinese_clip"),
     ]
 )
 
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 1de0249831d..1da2a644326 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -57,6 +57,7 @@
         ("camembert", "CamembertModel"),
         ("canine", "CanineModel"),
         ("chinese_clip", "ChineseCLIPModel"),
+        ("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
         ("clap", "ClapModel"),
         ("clip", "CLIPModel"),
         ("clip_vision_model", "CLIPVisionModel"),
diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py
index 399b4e6b0ec..53b6d49b3f6 100644
--- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py
@@ -171,8 +171,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`ChineseCLIPModel`]. It is used to instantiate
     an ChineseCLIP model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the ChineseCLIP
-    [OFA-Sys/chinese-clip-vit-base-patch16](https:
-    //huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) architecture.
+    [OFA-Sys/chinese-clip-vit-base-patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
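As a minimal sketch of what the new auto-mapping entries enable (illustrative, not part of the patch; it assumes a transformers build with this change applied), the Chinese-CLIP vision tower can now be resolved by its own model type:

```python
from transformers import AutoConfig

# "chinese_clip_vision_model" is now a registered model type, so the vision
# tower can be instantiated standalone instead of only via ChineseCLIPConfig.
config = AutoConfig.for_model("chinese_clip_vision_model")
print(type(config).__name__)  # ChineseCLIPVisionConfig
```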
diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
index 5dab0f42dc7..aab76f71db6 100644
--- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
@@ -18,11 +18,19 @@
 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
 from ..auto.configuration_auto import AutoConfig
+from ..chinese_clip.configuration_chinese_clip import ChineseCLIPVisionConfig
 from ..clip.configuration_clip import CLIPVisionConfig
+from ..siglip.configuration_siglip import SiglipVisionConfig
 
 
 logger = logging.get_logger(__name__)
 
+VISION_MODEL_CONFIGS = {
+    "clip_vision_model": CLIPVisionConfig,
+    "chinese_clip_vision_model": ChineseCLIPVisionConfig,
+    "siglip_vision_model": SiglipVisionConfig,
+}
+
 
 class VisionTextDualEncoderConfig(PretrainedConfig):
     r"""
@@ -85,12 +93,13 @@ def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs):
         vision_model_type = vision_config.pop("model_type")
         text_model_type = text_config.pop("model_type")
 
-        if vision_model_type == "clip":
-            self.vision_config = AutoConfig.for_model(vision_model_type, **vision_config).vision_config
-        elif vision_model_type == "clip_vision_model":
-            self.vision_config = CLIPVisionConfig(**vision_config)
+        vision_config_class = VISION_MODEL_CONFIGS.get(vision_model_type)
+        if vision_config_class is not None:
+            self.vision_config = vision_config_class(**vision_config)
         else:
             self.vision_config = AutoConfig.for_model(vision_model_type, **vision_config)
+            if hasattr(self.vision_config, "vision_config"):
+                self.vision_config = self.vision_config.vision_config
 
         self.text_config = AutoConfig.for_model(text_model_type, **text_config)
 
diff --git a/utils/check_copies.py b/utils/check_copies.py
index 7001473b9f2..e170ede7aeb 100644
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -1070,6 +1070,7 @@ def check_model_list_copy(overwrite: bool = False):
     "VisionTextDualEncoder",
     "CLIPVisionModel",
     "SiglipVisionModel",
+    "ChineseCLIPVisionModel",
 ]
 
 # Template for new entries to add in the main README when we have missing models.
diff --git a/utils/check_table.py b/utils/check_table.py
index 0042ce72fcc..99031f025c8 100644
--- a/utils/check_table.py
+++ b/utils/check_table.py
@@ -171,7 +171,7 @@ def _center_text(text: str, width: int) -> str:
     "XLS-R": "Wav2Vec2",
     "XLSR-Wav2Vec2": "Wav2Vec2",
 }
-MODEL_NAMES_TO_IGNORE = ["CLIPVisionModel", "SiglipVisionModel"]
+MODEL_NAMES_TO_IGNORE = ["CLIPVisionModel", "SiglipVisionModel", "ChineseCLIPVisionModel"]
 
 
 def get_model_table_from_auto_modules() -> str:
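For illustration only (not part of the patch): with the `VISION_MODEL_CONFIGS` dispatch table above, `VisionTextDualEncoderConfig` keeps a typed vision config for all three supported vision towers. A hedged sketch, assuming `ChineseCLIPVisionConfig` is importable from the top-level `transformers` namespace and the pairing of models is an arbitrary choice:

```python
from transformers import BertConfig, ChineseCLIPVisionConfig, VisionTextDualEncoderConfig

# Pair a Chinese-CLIP vision tower with a BERT text encoder (illustrative choice).
vision_config = ChineseCLIPVisionConfig()
text_config = BertConfig()

config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config)

# The serialized model_type "chinese_clip_vision_model" now matches an entry in
# VISION_MODEL_CONFIGS, so the vision config round-trips as its concrete class.
print(type(config.vision_config).__name__)  # ChineseCLIPVisionConfig
```

The `hasattr` fallback in the `else` branch covers composite model types such as `chinese_clip` or `clip` themselves: `AutoConfig.for_model` builds the full dual-tower config, and its nested `vision_config` attribute is extracted, which generalizes the old `clip`-only special case.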