From 01f30ae5578e30c2ec1e7def345a892fb9f338c6 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 22 Dec 2023 20:21:21 +0000 Subject: [PATCH 1/8] Enable instantiating model with pretrained backbone weights --- docs/source/en/main_classes/backbones.md | 39 +++++++++++++++++++----- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/docs/source/en/main_classes/backbones.md b/docs/source/en/main_classes/backbones.md index 5c3aacd125d0..1850f21f731d 100644 --- a/docs/source/en/main_classes/backbones.md +++ b/docs/source/en/main_classes/backbones.md @@ -19,9 +19,9 @@ rendered properly in your Markdown viewer. Backbones are models used for feature extraction for computer vision tasks. One can use a model as backbone in two ways: * initializing `AutoBackbone` class with a pretrained model, -* initializing a supported backbone configuration and passing it to the model architecture. +* initializing a supported backbone configuration and passing it to the model architecture. -## Using AutoBackbone +## Using AutoBackbone You can use `AutoBackbone` class to initialize a model as a backbone and get the feature maps for any stage. You can define `out_indices` to indicate the index of the layers which you would like to get the feature maps from. You can also use `out_features` if you know the name of the layers. You can use them interchangeably. If you are using both `out_indices` and `out_features`, ensure they are consistent. Not passing any of the feature map arguments will make the backbone yield the feature maps of the last layer. To visualize how stages look like, let's take the Swin model. Each stage is responsible from feature extraction, outputting feature maps. @@ -71,11 +71,19 @@ In computer vision, models consist of backbone, neck, and a head. Backbone extra ```py from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig -backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50") -config = MaskFormerConfig(backbone_config=backbone_config) +config = MaskFormerConfig(backbone="microsoft/resnet50", use_pretrained_backbone=True) +model = MaskFormerForInstanceSegmentation(config) +``` + +You can also initialize a backbone with random weights, with the same architecture as a pretrained model to initialize the model neck with it. +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig + +config = MaskFormerConfig(backbone="microsoft/resnet50", use_pretrained_backbone=False) model = MaskFormerForInstanceSegmentation(config) ``` -You can also initialize a backbone with random weights to initialize the model neck with it. + +Alternatively you can initialize the backbone with a supported backbone configuration, with randomly initialized weights. Below you can see how to initialize the [MaskFormer](../model_doc/maskformer) model with instance segmentation head with [ResNet](../model_doc/resnet) backbone. ```py backbone_config = ResNetConfig() @@ -85,9 +93,26 @@ model = MaskFormerForInstanceSegmentation(config) `timm` models are also supported in transformers through `TimmBackbone` and `TimmBackboneConfig`. 
-```python -from transformers import TimmBackboneConfig, TimmBackbone +```py +from transformers import TimmBackboneConfig, TimmBackbone, MaskFormerConfig, MaskFormerForInstanceSegmentation +# Create a timm backbone model from a config backbone_config = TimmBackboneConfig("resnet50") model = TimmBackbone(config=backbone_config) + +# Create a timm backbone from a pretrained model +model = TimmBackbone.from_pretrained("resnet50") + +# Create a MaskFormer model with a timm backbone +config = MaskFormerConfig(backbone="resnet50", use_timm_backbone=True) +model = MaskFormerForInstanceSegmentation(config) +``` + +By default, `TimmBackbone` will use pretrained weights. You can disable this by setting `use_pretrained=False`. + +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone="resnet50", use_timm_backbone=True, use_pretrained_backbone=False) +model = MaskFormerForInstanceSegmentation(config) ``` From 058ac86a234317097cf6b86bac28b4643bd09ce0 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 2 Jan 2024 18:03:30 +0000 Subject: [PATCH 2/8] Remove doc updates until changes made in modeling code --- docs/source/en/main_classes/backbones.md | 39 +++++------------------- 1 file changed, 7 insertions(+), 32 deletions(-) diff --git a/docs/source/en/main_classes/backbones.md b/docs/source/en/main_classes/backbones.md index 1850f21f731d..5c3aacd125d0 100644 --- a/docs/source/en/main_classes/backbones.md +++ b/docs/source/en/main_classes/backbones.md @@ -19,9 +19,9 @@ rendered properly in your Markdown viewer. Backbones are models used for feature extraction for computer vision tasks. One can use a model as backbone in two ways: * initializing `AutoBackbone` class with a pretrained model, -* initializing a supported backbone configuration and passing it to the model architecture. +* initializing a supported backbone configuration and passing it to the model architecture. -## Using AutoBackbone +## Using AutoBackbone You can use `AutoBackbone` class to initialize a model as a backbone and get the feature maps for any stage. You can define `out_indices` to indicate the index of the layers which you would like to get the feature maps from. You can also use `out_features` if you know the name of the layers. You can use them interchangeably. If you are using both `out_indices` and `out_features`, ensure they are consistent. Not passing any of the feature map arguments will make the backbone yield the feature maps of the last layer. To visualize how stages look like, let's take the Swin model. Each stage is responsible from feature extraction, outputting feature maps. @@ -71,19 +71,11 @@ In computer vision, models consist of backbone, neck, and a head. Backbone extra ```py from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig -config = MaskFormerConfig(backbone="microsoft/resnet50", use_pretrained_backbone=True) -model = MaskFormerForInstanceSegmentation(config) -``` - -You can also initialize a backbone with random weights, with the same architecture as a pretrained model to initialize the model neck with it. 
-```py -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig - -config = MaskFormerConfig(backbone="microsoft/resnet50", use_pretrained_backbone=False) +backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50") +config = MaskFormerConfig(backbone_config=backbone_config) model = MaskFormerForInstanceSegmentation(config) ``` - -Alternatively you can initialize the backbone with a supported backbone configuration, with randomly initialized weights. Below you can see how to initialize the [MaskFormer](../model_doc/maskformer) model with instance segmentation head with [ResNet](../model_doc/resnet) backbone. +You can also initialize a backbone with random weights to initialize the model neck with it. ```py backbone_config = ResNetConfig() @@ -93,26 +85,9 @@ model = MaskFormerForInstanceSegmentation(config) `timm` models are also supported in transformers through `TimmBackbone` and `TimmBackboneConfig`. -```py -from transformers import TimmBackboneConfig, TimmBackbone, MaskFormerConfig, MaskFormerForInstanceSegmentation +```python +from transformers import TimmBackboneConfig, TimmBackbone -# Create a timm backbone model from a config backbone_config = TimmBackboneConfig("resnet50") model = TimmBackbone(config=backbone_config) - -# Create a timm backbone from a pretrained model -model = TimmBackbone.from_pretrained("resnet50") - -# Create a MaskFormer model with a timm backbone -config = MaskFormerConfig(backbone="resnet50", use_timm_backbone=True) -model = MaskFormerForInstanceSegmentation(config) -``` - -By default, `TimmBackbone` will use pretrained weights. You can disable this by setting `use_pretrained=False`. - -```py -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation - -config = MaskFormerConfig(backbone="resnet50", use_timm_backbone=True, use_pretrained_backbone=False) -model = MaskFormerForInstanceSegmentation(config) ``` From 273ba28028b24a20b340c7aa6feb1e83d1f6ebc9 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 15:17:26 +0000 Subject: [PATCH 3/8] Use load_backbone instead --- .../modeling_conditional_detr.py | 4 ++-- .../deformable_detr/modeling_deformable_detr.py | 4 ++-- src/transformers/models/deta/modeling_deta.py | 6 +++--- src/transformers/models/detr/modeling_detr.py | 4 ++-- src/transformers/models/dpt/modeling_dpt.py | 16 +++++++--------- .../models/mask2former/modeling_mask2former.py | 4 ++-- .../models/maskformer/modeling_maskformer.py | 11 +++++------ .../models/oneformer/modeling_oneformer.py | 5 ++--- .../modeling_table_transformer.py | 4 ++-- src/transformers/models/tvp/modeling_tvp.py | 4 ++-- .../models/upernet/modeling_upernet.py | 4 ++-- .../models/vit_hybrid/modeling_vit_hybrid.py | 4 ++-- .../models/vitmatte/modeling_vitmatte.py | 4 ++-- utils/check_config_attributes.py | 2 ++ 14 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index b74f6accadfc..6c2cbb859c8e 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -37,7 +37,7 @@ replace_return_docstrings, requires_backends, ) -from ..auto import AutoBackbone +from ...utils.backbone_utils import load_backbone from .configuration_conditional_detr import ConditionalDetrConfig @@ -363,7 +363,7 @@ def __init__(self, 
config): **kwargs, ) else: - backbone = AutoBackbone.from_config(config.backbone_config) + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 3767eef0392f..9eed0b8ba45c 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -44,7 +44,7 @@ from ...modeling_utils import PreTrainedModel from ...pytorch_utils import meshgrid from ...utils import is_ninja_available, logging -from ..auto import AutoBackbone +from ...utils.backbone_utils import load_backbone from .configuration_deformable_detr import DeformableDetrConfig from .load_custom import load_cuda_kernels @@ -409,7 +409,7 @@ def __init__(self, config): **kwargs, ) else: - backbone = AutoBackbone.from_config(config.backbone_config) + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index 330ccfe3f0c3..b1e263331c05 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -38,8 +38,8 @@ from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import meshgrid -from ...utils import is_accelerate_available, is_torchvision_available, logging, requires_backends -from ..auto import AutoBackbone +from ...utils import is_torchvision_available, logging, requires_backends +from ...utils.backbone_utils import load_backbone from .configuration_deta import DetaConfig @@ -338,7 +338,7 @@ class DetaBackboneWithPositionalEncodings(nn.Module): def __init__(self, config): super().__init__() - backbone = AutoBackbone.from_config(config.backbone_config) + backbone = load_backbone(config) with torch.no_grad(): replace_batch_norm(backbone) self.model = backbone diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index a3078cd2d0ae..026100b24506 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -37,7 +37,7 @@ replace_return_docstrings, requires_backends, ) -from ..auto import AutoBackbone +from ...utils.backbone_utils import load_backbone from .configuration_detr import DetrConfig @@ -356,7 +356,7 @@ def __init__(self, config): **kwargs, ) else: - backbone = AutoBackbone.from_config(config.backbone_config) + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 09fc6406fd85..595712bb721a 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -41,7 +41,7 @@ from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, logging -from ..auto import AutoBackbone +from ...utils.backbone_utils import load_backbone from .configuration_dpt import DPTConfig @@ -131,12 +131,10 @@ def __init__(self, config, feature_size=None): patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) num_patches = (image_size[1] // patch_size[1]) * 
(image_size[0] // patch_size[0]) - self.backbone = AutoBackbone.from_config(config.backbone_config) + self.backbone = load_backbone(config) feature_dim = self.backbone.channels[-1] - if len(config.backbone_config.out_features) != 3: - raise ValueError( - f"Expected backbone to have 3 output features, got {len(config.backbone_config.out_features)}" - ) + if len(self.backbone.channels) != 3: + raise ValueError(f"Expected backbone to have 3 output features, got {len(self.backbone.channels)}") self.residual_feature_map_index = [0, 1] # Always take the output of the first and second backbone stage if feature_size is None: @@ -1081,10 +1079,10 @@ def __init__(self, config): super().__init__(config) self.backbone = None - if config.backbone_config is not None and config.is_hybrid is False: - self.backbone = AutoBackbone.from_config(config.backbone_config) - else: + if config.is_hybrid: self.dpt = DPTModel(config, add_pooling_layer=False) + else: + self.backbone = load_backbone(config) # Neck self.neck = DPTNeck(config) diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index eeee25967e4f..a88028a80717 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -23,7 +23,6 @@ import torch from torch import Tensor, nn -from ... import AutoBackbone from ...activations import ACT2FN from ...file_utils import ( ModelOutput, @@ -36,6 +35,7 @@ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions from ...modeling_utils import PreTrainedModel from ...utils import logging +from ...utils.backbone_utils import load_backbone from .configuration_mask2former import Mask2FormerConfig @@ -1376,7 +1376,7 @@ def __init__(self, config: Mask2FormerConfig): """ super().__init__() - self.encoder = AutoBackbone.from_config(config.backbone_config) + self.encoder = load_backbone(config) self.decoder = Mask2FormerPixelDecoder(config, feature_channels=self.encoder.channels) def forward(self, pixel_values: Tensor, output_hidden_states: bool = False) -> Mask2FormerPixelLevelModuleOutput: diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index dc46a6e87988..026ea15d4439 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -23,7 +23,6 @@ import torch from torch import Tensor, nn -from ... import AutoBackbone from ...activations import ACT2FN from ...modeling_attn_mask_utils import _prepare_4d_attention_mask from ...modeling_outputs import BaseModelOutputWithCrossAttentions @@ -37,6 +36,7 @@ replace_return_docstrings, requires_backends, ) +from ...utils.backbone_utils import load_backbone from ..detr import DetrConfig from .configuration_maskformer import MaskFormerConfig from .configuration_maskformer_swin import MaskFormerSwinConfig @@ -1428,14 +1428,13 @@ def __init__(self, config: MaskFormerConfig): The configuration used to instantiate this model. 
""" super().__init__() - - # TODD: add method to load pretrained weights of backbone - backbone_config = config.backbone_config - if backbone_config.model_type == "swin": + if hasattr(config, "backbone_config") and config.backbone_config.model_type == "swin": # for backwards compatibility + backbone_config = config.backbone_config backbone_config = MaskFormerSwinConfig.from_dict(backbone_config.to_dict()) backbone_config.out_features = ["stage1", "stage2", "stage3", "stage4"] - self.encoder = AutoBackbone.from_config(backbone_config) + config.backbone_config = backbone_config + self.encoder = load_backbone(config) feature_channels = self.encoder.channels self.decoder = MaskFormerPixelDecoder( diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index d0c0d405502e..894dac10f7ea 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -24,7 +24,6 @@ from torch import Tensor, nn from torch.cuda.amp import autocast -from ... import AutoBackbone from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel @@ -37,6 +36,7 @@ replace_return_docstrings, requires_backends, ) +from ...utils.backbone_utils import load_backbone from .configuration_oneformer import OneFormerConfig @@ -1478,8 +1478,7 @@ def __init__(self, config: OneFormerConfig): The configuration used to instantiate this model. """ super().__init__() - backbone_config = config.backbone_config - self.encoder = AutoBackbone.from_config(backbone_config) + self.encoder = load_backbone(config) self.decoder = OneFormerPixelDecoder(config, feature_channels=self.encoder.channels) def forward(self, pixel_values: Tensor, output_hidden_states: bool = False) -> OneFormerPixelLevelModuleOutput: diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 81afcdc9c18f..19aa680ad038 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -37,7 +37,7 @@ replace_return_docstrings, requires_backends, ) -from ..auto import AutoBackbone +from ...utils.backbone_utils import load_backbone from .configuration_table_transformer import TableTransformerConfig @@ -290,7 +290,7 @@ def __init__(self, config): **kwargs, ) else: - backbone = AutoBackbone.from_config(config.backbone_config) + backbone = load_backbone(config) # replace batch norm by frozen batch norm with torch.no_grad(): diff --git a/src/transformers/models/tvp/modeling_tvp.py b/src/transformers/models/tvp/modeling_tvp.py index 04192630eebd..c80cc9df0b35 100644 --- a/src/transformers/models/tvp/modeling_tvp.py +++ b/src/transformers/models/tvp/modeling_tvp.py @@ -28,7 +28,7 @@ from ...modeling_utils import PreTrainedModel from ...pytorch_utils import prune_linear_layer from ...utils import logging -from ..auto import AutoBackbone +from ...utils.backbone_utils import load_backbone from .configuration_tvp import TvpConfig @@ -148,7 +148,7 @@ def forward(self, logits, labels): class TvpVisionModel(nn.Module): def __init__(self, config): super().__init__() - self.backbone = AutoBackbone.from_config(config.backbone_config) + self.backbone = load_backbone(config) self.grid_encoder_conv = nn.Conv2d( config.backbone_config.hidden_sizes[-1], config.hidden_size, diff --git 
a/src/transformers/models/upernet/modeling_upernet.py b/src/transformers/models/upernet/modeling_upernet.py index 2ad8e8c372f1..b889ae4eb4ce 100644 --- a/src/transformers/models/upernet/modeling_upernet.py +++ b/src/transformers/models/upernet/modeling_upernet.py @@ -20,10 +20,10 @@ from torch import nn from torch.nn import CrossEntropyLoss -from ... import AutoBackbone from ...modeling_outputs import SemanticSegmenterOutput from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from ...utils.backbone_utils import load_backbone from .configuration_upernet import UperNetConfig @@ -348,7 +348,7 @@ class UperNetForSemanticSegmentation(UperNetPreTrainedModel): def __init__(self, config): super().__init__(config) - self.backbone = AutoBackbone.from_config(config.backbone_config) + self.backbone = load_backbone(config) # Semantic segmentation head(s) self.decode_head = UperNetHead(config, in_channels=self.backbone.channels) diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 24b133e27af0..3dc715af511c 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -29,7 +29,7 @@ from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging -from ..auto import AutoBackbone +from ...utils.backbone_utils import load_backbone from .configuration_vit_hybrid import ViTHybridConfig @@ -150,7 +150,7 @@ def __init__(self, config, feature_size=None): image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - self.backbone = AutoBackbone.from_config(config.backbone_config) + self.backbone = load_backbone(config) if self.backbone.config.model_type != "bit": raise ValueError(f"Backbone model type {self.backbone.model_type} is not supported.") feature_dim = self.backbone.channels[-1] diff --git a/src/transformers/models/vitmatte/modeling_vitmatte.py b/src/transformers/models/vitmatte/modeling_vitmatte.py index 01e6ed5aa0a3..465f5da6adf5 100644 --- a/src/transformers/models/vitmatte/modeling_vitmatte.py +++ b/src/transformers/models/vitmatte/modeling_vitmatte.py @@ -20,7 +20,6 @@ import torch from torch import nn -from ... 
import AutoBackbone from ...modeling_utils import PreTrainedModel from ...utils import ( ModelOutput, @@ -28,6 +27,7 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) +from ...utils.backbone_utils import load_backbone from .configuration_vitmatte import VitMatteConfig @@ -259,7 +259,7 @@ def __init__(self, config): super().__init__(config) self.config = config - self.backbone = AutoBackbone.from_config(config.backbone_config) + self.backbone = load_backbone(config) self.decoder = VitMatteDetailCaptureModule(config) # Initialize weights and apply final processing diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index f1d8b741411f..af1feb9d2b0b 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -220,6 +220,8 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s "out_indices", "sampling_rate", "use_pretrained_backbone", + "backbone", + "backbone_config", ] attributes_used_in_generation = ["encoder_no_repeat_ngram_size"] From 47c5c2a96df4ed027e3950eff0dccbfe9a860ece Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Thu, 4 Jan 2024 18:23:38 +0000 Subject: [PATCH 4/8] Add use_timm_backbone to the model configs --- src/transformers/models/deta/configuration_deta.py | 5 +++++ src/transformers/models/dpt/configuration_dpt.py | 5 +++++ .../models/mask2former/configuration_mask2former.py | 5 +++++ .../models/maskformer/configuration_maskformer.py | 5 +++++ src/transformers/models/oneformer/configuration_oneformer.py | 5 +++++ src/transformers/models/tvp/configuration_tvp.py | 4 ++++ src/transformers/models/upernet/configuration_upernet.py | 5 +++++ .../models/vit_hybrid/configuration_vit_hybrid.py | 5 +++++ src/transformers/models/vitmatte/configuration_vitmatte.py | 5 +++++ utils/check_config_attributes.py | 2 ++ 10 files changed, 46 insertions(+) diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index 1ade9465a9f3..633d6267ef3d 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -46,6 +46,9 @@ class DetaConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. num_queries (`int`, *optional*, defaults to 900): Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead. 
@@ -146,6 +149,7 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, num_queries=900, max_position_embeddings=2048, encoder_layers=6, @@ -203,6 +207,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.num_queries = num_queries self.max_position_embeddings = max_position_embeddings self.d_model = d_model diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 0b6366659bc1..1b0389d76dcc 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -117,6 +117,9 @@ class DPTConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. Example: @@ -169,6 +172,7 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, **kwargs, ): super().__init__(**kwargs) @@ -229,6 +233,7 @@ def __init__( self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.num_hidden_layers = None if use_autobackbone else num_hidden_layers self.num_attention_heads = None if use_autobackbone else num_attention_heads self.intermediate_size = None if use_autobackbone else intermediate_size diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py index 7202e551a0cb..0d27ba39cbde 100644 --- a/src/transformers/models/mask2former/configuration_mask2former.py +++ b/src/transformers/models/mask2former/configuration_mask2former.py @@ -53,6 +53,9 @@ class Mask2FormerConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. feature_size (`int`, *optional*, defaults to 256): The features (channels) of the resulting feature maps. 
mask_feature_size (`int`, *optional*, defaults to 256): @@ -162,6 +165,7 @@ def __init__( output_auxiliary_logits: bool = None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, **kwargs, ): if use_pretrained_backbone: @@ -228,6 +232,7 @@ def __init__( self.num_hidden_layers = decoder_layers self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone super().__init__(**kwargs) diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index 3d2814dbfdc1..e906ceb2b39f 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -63,6 +63,9 @@ class MaskFormerConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. decoder_config (`Dict`, *optional*): The configuration passed to the transformer decoder model, if unset the base config for `detr-resnet-50` will be used. @@ -122,6 +125,7 @@ def __init__( output_auxiliary_logits: Optional[bool] = None, backbone: Optional[str] = None, use_pretrained_backbone: bool = False, + use_timm_backbone: bool = False, **kwargs, ): if use_pretrained_backbone: @@ -193,6 +197,7 @@ def __init__( self.num_hidden_layers = self.decoder_config.num_hidden_layers self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone super().__init__(**kwargs) @classmethod diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index 6cf54947de6b..429cb42b69d7 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -50,6 +50,9 @@ class OneFormerConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. ignore_value (`int`, *optional*, defaults to 255): Values to be ignored in GT label while calculating loss. 
num_queries (`int`, *optional*, defaults to 150): @@ -152,6 +155,7 @@ def __init__( backbone_config: Optional[Dict] = None, backbone: Optional[str] = None, use_pretrained_backbone: bool = False, + use_timm_backbone: bool = False, ignore_value: int = 255, num_queries: int = 150, no_object_weight: int = 0.1, @@ -222,6 +226,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.ignore_value = ignore_value self.num_queries = num_queries self.no_object_weight = no_object_weight diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index 954ee4e90cb1..e7e631f9cdc9 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -49,6 +49,9 @@ class TvpConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. distance_loss_weight (`float`, *optional*, defaults to 1.0): The weight of distance loss. duration_loss_weight (`float`, *optional*, defaults to 0.1): @@ -143,6 +146,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.distance_loss_weight = distance_loss_weight self.duration_loss_weight = duration_loss_weight self.visual_prompter_type = visual_prompter_type diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index c4e6f8168f55..9288bd67b610 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -42,6 +42,9 @@ class UperNetConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. hidden_size (`int`, *optional*, defaults to 512): The number of hidden units in the convolutional layers. 
initializer_range (`float`, *optional*, defaults to 0.02): @@ -83,6 +86,7 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, hidden_size=512, initializer_range=0.02, pool_scales=[1, 2, 3, 6], @@ -113,6 +117,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.hidden_size = hidden_size self.initializer_range = initializer_range self.pool_scales = pool_scales diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index b0a37617dc1e..8a998cfb97e1 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -48,6 +48,9 @@ class ViTHybridConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -100,6 +103,7 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -147,6 +151,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 608b606c9bcb..35a044052d1d 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -48,6 +48,9 @@ class VitMatteConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. hidden_size (`int`, *optional*, defaults to 384): The number of input channels of the decoder. 
batch_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -81,6 +84,7 @@ def __init__( backbone_config: PretrainedConfig = None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, hidden_size: int = 384, batch_norm_eps: float = 1e-5, initializer_range: float = 0.02, @@ -107,6 +111,7 @@ def __init__( self.backbone_config = backbone_config self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone self.batch_norm_eps = batch_norm_eps self.hidden_size = hidden_size self.initializer_range = initializer_range diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index af1feb9d2b0b..825f53dbb751 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -219,9 +219,11 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s "out_features", "out_indices", "sampling_rate", + # backbone related arguments passed to load_backbone "use_pretrained_backbone", "backbone", "backbone_config", + "use_timm_backbone" ] attributes_used_in_generation = ["encoder_no_repeat_ngram_size"] From 356eafa3b32d783cb2cf4632ffc6eaf54e52cf11 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 24 Jan 2024 14:08:57 +0000 Subject: [PATCH 5/8] Add missing imports and arguments --- src/transformers/models/deta/modeling_deta.py | 2 +- src/transformers/models/tvp/configuration_tvp.py | 1 + utils/check_config_attributes.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index b1e263331c05..b6c65a3c301b 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -38,7 +38,7 @@ from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import meshgrid -from ...utils import is_torchvision_available, logging, requires_backends +from ...utils import is_accelerate_available, is_torchvision_available, logging, requires_backends from ...utils.backbone_utils import load_backbone from .configuration_deta import DetaConfig diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index e7e631f9cdc9..79c6cf47c6c1 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -106,6 +106,7 @@ def __init__( backbone_config=None, backbone=None, use_pretrained_backbone=False, + use_timm_backbone=False, distance_loss_weight=1.0, duration_loss_weight=0.1, visual_prompter_type="framepad", diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 825f53dbb751..10ba5d187206 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -223,7 +223,7 @@ def check_attribute_being_used(config_class, attributes, default_value, source_s "use_pretrained_backbone", "backbone", "backbone_config", - "use_timm_backbone" + "use_timm_backbone", ] attributes_used_in_generation = ["encoder_no_repeat_ngram_size"] From 4cae432d54007df1e388f66e873362179af6d985 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 24 Jan 2024 15:37:50 +0000 Subject: [PATCH 6/8] Update docstrings --- src/transformers/models/dpt/configuration_dpt.py | 2 +- src/transformers/models/oneformer/configuration_oneformer.py | 2 +- 
src/transformers/models/tvp/configuration_tvp.py | 2 +- src/transformers/models/vit_hybrid/configuration_vit_hybrid.py | 2 +- src/transformers/models/vitmatte/configuration_vitmatte.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 1b0389d76dcc..c514670f761e 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -117,7 +117,7 @@ class DPTConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index 429cb42b69d7..b88e2c559098 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -50,7 +50,7 @@ class OneFormerConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. ignore_value (`int`, *optional*, defaults to 255): diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index 79c6cf47c6c1..7e985ab84e30 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -49,7 +49,7 @@ class TvpConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. distance_loss_weight (`float`, *optional*, defaults to 1.0): diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 8a998cfb97e1..30ebe4fba659 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -48,7 +48,7 @@ class ViTHybridConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether to load `backbone` from the timm library. 
If `False`, the backbone is loaded from the transformers library. hidden_size (`int`, *optional*, defaults to 768): diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 35a044052d1d..4d2bcc612fe9 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -48,7 +48,7 @@ class VitMatteConfig(PretrainedConfig): is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. use_pretrained_backbone (`bool`, *optional*, defaults to `False`): Whether to use pretrained weights for the backbone. - use_timm_backbone (`bool`, *optional*, `False`): + use_timm_backbone (`bool`, *optional*, defaults to `False`): Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers library. hidden_size (`int`, *optional*, defaults to 384): From 83ff616d6ac9ad796ad7b41ee4f4ea7f067410d3 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 24 Jan 2024 16:39:17 +0000 Subject: [PATCH 7/8] Make sure test is properly configured --- tests/models/conditional_detr/test_modeling_conditional_detr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 0bb9388d593f..aa0318f241aa 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -443,6 +443,7 @@ def test_different_timm_backbone(self): # let's pick a random timm backbone config.backbone = "tf_mobilenetv3_small_075" + config.use_timm_backbone = True for model_class in self.all_model_classes: model = model_class(config) From a3e6d7d339ff36c9db8e813c3519a8548b166b56 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 24 Jan 2024 16:39:45 +0000 Subject: [PATCH 8/8] Include recent DPT updates --- src/transformers/models/dpt/configuration_dpt.py | 15 +++++++-------- src/transformers/models/dpt/modeling_dpt.py | 6 +++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index c514670f761e..5bb48ad9780a 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -183,9 +183,6 @@ def __init__( if use_pretrained_backbone: raise ValueError("Pretrained backbones are not supported yet.") - if backbone_config is not None and backbone is not None: - raise ValueError("You can't specify both `backbone` and `backbone_config`.") - use_autobackbone = False if self.is_hybrid: if backbone_config is None and backbone is None: @@ -197,17 +194,17 @@ def __init__( "out_features": ["stage1", "stage2", "stage3"], "embedding_dynamic_padding": True, } - self.backbone_config = BitConfig(**backbone_config) + backbone_config = BitConfig(**backbone_config) elif isinstance(backbone_config, dict): logger.info("Initializing the config with a `BiT` backbone.") - self.backbone_config = BitConfig(**backbone_config) + backbone_config = BitConfig(**backbone_config) elif isinstance(backbone_config, PretrainedConfig): - self.backbone_config = backbone_config + backbone_config = backbone_config else: raise ValueError( f"backbone_config must be a dictionary or a 
`PretrainedConfig`, got {backbone_config.__class__}." ) - + self.backbone_config = backbone_config self.backbone_featmap_shape = backbone_featmap_shape self.neck_ignore_stages = neck_ignore_stages @@ -225,12 +222,14 @@ def __init__( self.backbone_config = backbone_config self.backbone_featmap_shape = None self.neck_ignore_stages = [] - else: self.backbone_config = backbone_config self.backbone_featmap_shape = None self.neck_ignore_stages = [] + if use_autobackbone and backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + self.backbone = backbone self.use_pretrained_backbone = use_pretrained_backbone self.use_timm_backbone = use_timm_backbone diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 595712bb721a..e986e71d4851 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1079,10 +1079,10 @@ def __init__(self, config): super().__init__(config) self.backbone = None - if config.is_hybrid: - self.dpt = DPTModel(config, add_pooling_layer=False) - else: + if config.backbone_config is not None and config.is_hybrid is False: self.backbone = load_backbone(config) + else: + self.dpt = DPTModel(config, add_pooling_layer=False) # Neck self.neck = DPTNeck(config)
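
The common thread of the series is that every model stops calling `AutoBackbone.from_config(config.backbone_config)` directly and instead defers to `load_backbone(config)`, which resolves the backbone from the new config attributes (`backbone`, `backbone_config`, `use_pretrained_backbone`, `use_timm_backbone`). The snippet below is a minimal sketch of that resolution logic; the helper name `load_backbone_sketch` and the exact order of the checks are illustrative assumptions, and the shipped implementation in `transformers.utils.backbone_utils` may differ in detail.

```py
# Illustrative sketch only: approximates the branching load_backbone is expected to
# perform from the backbone-related config attributes introduced in this series.
from transformers import AutoBackbone, AutoConfig, TimmBackbone, TimmBackboneConfig


def load_backbone_sketch(config):
    """Resolve a backbone model from the backbone-related attributes on a model config."""
    if getattr(config, "use_timm_backbone", False):
        # `config.backbone` names a timm model, e.g. "resnet50"
        return TimmBackbone(config=TimmBackboneConfig(config.backbone))
    if config.backbone is not None and config.use_pretrained_backbone:
        # `config.backbone` names a Hub checkpoint, e.g. "microsoft/resnet-50"
        return AutoBackbone.from_pretrained(config.backbone)
    if config.backbone is not None:
        # Same architecture as the checkpoint, but randomly initialized weights
        return AutoBackbone.from_config(AutoConfig.from_pretrained(config.backbone))
    # Fall back to an explicit backbone config, also with random weights
    return AutoBackbone.from_config(config.backbone_config)
```

Centralising this resolution in one helper is what lets the per-model `AutoBackbone.from_config(config.backbone_config)` call sites in the earlier patches collapse to a single `load_backbone(config)` call, with the config classes only responsible for storing and validating the four backbone arguments.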