diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index 9a9c28e37..d328338e1 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -3499,18 +3499,6 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> Vis
     return model
 
 
-@register_model
-def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
-    model_args = dict(
-        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
-        class_token=False, global_pool='avg', fc_norm=False,
-    )
-    model = _create_vision_transformer(
-        'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
 @register_model
 def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
@@ -3561,9 +3549,10 @@ def vit_so400m_patch14_siglip_gap_896(pretrained: bool = False, **kwargs) -> Vis
 
 @register_model
 def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
     model_args = dict(
-        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False,
-        global_pool='avg', fc_norm=False, act_layer='gelu_tanh'
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False, act_layer='gelu_tanh',
     )
     model = _create_vision_transformer(
         'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
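For reference, a minimal sketch of how the re-registered variant could be exercised through timm's model factory after this change; the model name comes from the diff above, while the batch size, random input tensor, and the 256x256 resolution (inferred from the _256 suffix) are illustrative assumptions:

import timm
import torch

# Build the SigLIP GAP variant registered above (random init; pretrained weights
# are only downloaded if pretrained=True and a checkpoint exists for this tag).
model = timm.create_model('vit_so400m_patch16_siglip_gap_256', pretrained=False)
model.eval()

# 256x256 input assumed from the model name's _256 suffix.
x = torch.randn(1, 3, 256, 256)
with torch.no_grad():
    out = model(x)
print(out.shape)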